[llvm] 88f1a2c - [X86] combineLoad - allow constant loads to share matching 'lower constant bits' with larger VBROADCAST_LOAD/SUBV_BROADCAST_LOAD nodes

Simon Pilgrim via llvm-commits llvm-commits at lists.llvm.org
Thu Dec 14 09:31:36 PST 2023


Author: Simon Pilgrim
Date: 2023-12-14T17:31:07Z
New Revision: 88f1a2c50d722fe0e444b3e4d8f41524b2c9170d

URL: https://github.com/llvm/llvm-project/commit/88f1a2c50d722fe0e444b3e4d8f41524b2c9170d
DIFF: https://github.com/llvm/llvm-project/commit/88f1a2c50d722fe0e444b3e4d8f41524b2c9170d.diff

LOG: [X86] combineLoad - allow constant loads to share matching 'lower constant bits' with larger VBROADCAST_LOAD/SUBV_BROADCAST_LOAD nodes

We already had separate support for VBROADCAST_LOAD - merge this with the generic load handling and add SUBV_BROADCAST_LOAD support as well.
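For context, a minimal LLVM IR sketch (hypothetical function, modelled on the f8xi64_i128 test below) of the pattern this targets: the <2 x i64> constant <0,1> used by the add is the lower half of the <4 x i64> constant <0,1,0,1> used by the and, and the larger constant is typically materialized as a subvector broadcast (vbroadcastf128 / SUBV_BROADCAST_LOAD), so the smaller constant-pool load can now reuse the lower 128 bits of the broadcast load instead of emitting a second load:

  define void @shared_constant(<2 x i64> %x, <4 x i64> %y, ptr %p, ptr %q) {
    ; 128-bit constant <0,1> - matches the low half of the broadcast below.
    %lo = add <2 x i64> %x, <i64 0, i64 1>
    ; 256-bit constant <0,1,0,1> - a 128-bit subvector splat.
    %hi = and <4 x i64> %y, <i64 0, i64 1, i64 0, i64 1>
    store <2 x i64> %lo, ptr %p
    store <4 x i64> %hi, ptr %q
    ret void
  }

The broadcast-elm-cross-splat-vec.ll diff below shows exactly this effect: the separate 128-bit vmovdqa constant load disappears and vpaddq reuses %xmm2, the low half of the vbroadcastf128 result.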

Added: 
    

Modified: 
    llvm/lib/Target/X86/X86ISelLowering.cpp
    llvm/test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll
    llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll
    llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll
    llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-5.ll
    llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-7.ll
    llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-8.ll
    llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-6.ll
    llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-7.ll
    llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-8.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 075a49df9da9a5..cb117475dbe4cf 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -49963,33 +49963,15 @@ static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
           }
           return true;
         };
-        if (User->getOpcode() == X86ISD::VBROADCAST_LOAD &&
-            getTargetConstantFromBasePtr(Ptr)) {
-          // See if we are loading a constant that has also been broadcast.
-          APInt Undefs, UserUndefs;
-          SmallVector<APInt> Bits, UserBits;
-          if (getTargetConstantBitsFromNode(SDValue(N, 0), 8, Undefs, Bits) &&
-              getTargetConstantBitsFromNode(SDValue(User, 0), 8, UserUndefs,
-                                            UserBits)) {
-            UserUndefs = UserUndefs.trunc(Undefs.getBitWidth());
-            UserBits.truncate(Bits.size());
-            if (MatchingBits(Undefs, UserUndefs, Bits, UserBits)) {
-              SDValue Extract = extractSubVector(
-                  SDValue(User, 0), 0, DAG, SDLoc(N), RegVT.getSizeInBits());
-              Extract = DAG.getBitcast(RegVT, Extract);
-              return DCI.CombineTo(N, Extract, SDValue(User, 1));
-            }
-          }
-        }
-        if (ISD::isNormalLoad(User)) {
-          // See if we are loading a constant that matches in the lower
-          // bits of a longer constant (but from a different constant pool ptr).
-          SDValue UserPtr = cast<MemSDNode>(User)->getBasePtr();
-          const Constant *LdC = getTargetConstantFromBasePtr(Ptr);
-          const Constant *UserC = getTargetConstantFromBasePtr(UserPtr);
-          if (LdC && UserC && UserPtr != Ptr &&
-              LdC->getType()->getPrimitiveSizeInBits() <
-                  UserC->getType()->getPrimitiveSizeInBits()) {
+        // See if we are loading a constant that matches in the lower
+        // bits of a longer constant (but from a different constant pool ptr).
+        SDValue UserPtr = cast<MemSDNode>(User)->getBasePtr();
+        const Constant *LdC = getTargetConstantFromBasePtr(Ptr);
+        const Constant *UserC = getTargetConstantFromBasePtr(UserPtr);
+        if (LdC && UserC && UserPtr != Ptr) {
+          unsigned LdSize = LdC->getType()->getPrimitiveSizeInBits();
+          unsigned UserSize = UserC->getType()->getPrimitiveSizeInBits();
+          if (LdSize < UserSize || !ISD::isNormalLoad(User)) {
             APInt Undefs, UserUndefs;
             SmallVector<APInt> Bits, UserBits;
             if (getTargetConstantBitsFromNode(SDValue(N, 0), 8, Undefs, Bits) &&

diff --git a/llvm/test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll b/llvm/test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll
index 9aa90de654a443..9125f2492ebb80 100644
--- a/llvm/test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll
+++ b/llvm/test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll
@@ -1459,17 +1459,16 @@ define <8 x i64> @f8xi64_i128(<8 x i64> %a) {
 ;
 ; AVX-64-LABEL: f8xi64_i128:
 ; AVX-64:       # %bb.0:
-; AVX-64-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX-64-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0]
-; AVX-64-NEXT:    vpaddq %xmm3, %xmm2, %xmm2
-; AVX-64-NEXT:    vpaddq %xmm3, %xmm1, %xmm1
-; AVX-64-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
-; AVX-64-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX-64-NEXT:    vpaddq %xmm3, %xmm2, %xmm2
-; AVX-64-NEXT:    vpaddq %xmm3, %xmm0, %xmm0
-; AVX-64-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX-64-NEXT:    vbroadcastf128 {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0]
+; AVX-64-NEXT:    vbroadcastf128 {{.*#+}} ymm2 = [0,1,0,1]
 ; AVX-64-NEXT:    # ymm2 = mem[0,1,0,1]
+; AVX-64-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; AVX-64-NEXT:    vpaddq %xmm2, %xmm3, %xmm3
+; AVX-64-NEXT:    vpaddq %xmm2, %xmm1, %xmm1
+; AVX-64-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX-64-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX-64-NEXT:    vpaddq %xmm2, %xmm3, %xmm3
+; AVX-64-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
+; AVX-64-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
 ; AVX-64-NEXT:    vandps %ymm2, %ymm0, %ymm0
 ; AVX-64-NEXT:    vandps %ymm2, %ymm1, %ymm1
 ; AVX-64-NEXT:    retq

diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll
index c7fca67c75aea5..78065bc73c1d3f 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll
@@ -3842,117 +3842,108 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %r10
 ; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %r11
 ; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %zmm1
-; AVX512BW-NEXT:    vmovdqa64 128(%rdi), %zmm3
+; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %zmm2
+; AVX512BW-NEXT:    vmovdqa64 128(%rdi), %zmm1
 ; AVX512BW-NEXT:    vmovdqa64 192(%rdi), %zmm4
 ; AVX512BW-NEXT:    vmovdqa64 320(%rdi), %zmm5
-; AVX512BW-NEXT:    vmovdqa64 256(%rdi), %zmm2
+; AVX512BW-NEXT:    vmovdqa64 256(%rdi), %zmm3
 ; AVX512BW-NEXT:    vmovdqa64 448(%rdi), %zmm7
 ; AVX512BW-NEXT:    vmovdqa64 384(%rdi), %zmm6
 ; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm8 = [0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56]
 ; AVX512BW-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm9
 ; AVX512BW-NEXT:    vpermt2w %zmm7, %zmm8, %zmm9
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm10
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm10
 ; AVX512BW-NEXT:    vpermt2w %zmm5, %zmm8, %zmm10
 ; AVX512BW-NEXT:    movb $-64, %dil
 ; AVX512BW-NEXT:    kmovd %edi, %k1
 ; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm10 {%k1}
-; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm9 = [0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56]
-; AVX512BW-NEXT:    # ymm9 = mem[0,1,0,1]
-; AVX512BW-NEXT:    vpermi2w %zmm4, %zmm3, %zmm9
-; AVX512BW-NEXT:    vpermi2w %zmm1, %zmm0, %zmm8
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm9
+; AVX512BW-NEXT:    vpermt2w %zmm4, %zmm8, %zmm9
+; AVX512BW-NEXT:    vpermi2w %zmm2, %zmm0, %zmm8
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
 ; AVX512BW-NEXT:    vinserti64x4 $0, %ymm8, %zmm10, %zmm8
 ; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm9 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57]
 ; AVX512BW-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm10
 ; AVX512BW-NEXT:    vpermt2w %zmm7, %zmm9, %zmm10
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm11
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm11
 ; AVX512BW-NEXT:    vpermt2w %zmm5, %zmm9, %zmm11
 ; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm11 {%k1}
-; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm10 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57]
-; AVX512BW-NEXT:    # ymm10 = mem[0,1,0,1]
-; AVX512BW-NEXT:    vpermi2w %zmm4, %zmm3, %zmm10
-; AVX512BW-NEXT:    vpermi2w %zmm1, %zmm0, %zmm9
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm10
+; AVX512BW-NEXT:    vpermt2w %zmm4, %zmm9, %zmm10
+; AVX512BW-NEXT:    vpermi2w %zmm2, %zmm0, %zmm9
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7]
 ; AVX512BW-NEXT:    vinserti64x4 $0, %ymm9, %zmm11, %zmm9
 ; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm10 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58]
 ; AVX512BW-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm11
 ; AVX512BW-NEXT:    vpermt2w %zmm7, %zmm10, %zmm11
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm12
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm12
 ; AVX512BW-NEXT:    vpermt2w %zmm5, %zmm10, %zmm12
 ; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm12 {%k1}
-; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm11 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58]
-; AVX512BW-NEXT:    # ymm11 = mem[0,1,0,1]
-; AVX512BW-NEXT:    vpermi2w %zmm4, %zmm3, %zmm11
-; AVX512BW-NEXT:    vpermi2w %zmm1, %zmm0, %zmm10
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm11
+; AVX512BW-NEXT:    vpermt2w %zmm4, %zmm10, %zmm11
+; AVX512BW-NEXT:    vpermi2w %zmm2, %zmm0, %zmm10
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7]
 ; AVX512BW-NEXT:    vinserti64x4 $0, %ymm10, %zmm12, %zmm10
 ; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm11 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59]
 ; AVX512BW-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm12
 ; AVX512BW-NEXT:    vpermt2w %zmm7, %zmm11, %zmm12
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm13
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm13
 ; AVX512BW-NEXT:    vpermt2w %zmm5, %zmm11, %zmm13
 ; AVX512BW-NEXT:    vmovdqa64 %zmm12, %zmm13 {%k1}
-; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm12 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59]
-; AVX512BW-NEXT:    # ymm12 = mem[0,1,0,1]
-; AVX512BW-NEXT:    vpermi2w %zmm4, %zmm3, %zmm12
-; AVX512BW-NEXT:    vpermi2w %zmm1, %zmm0, %zmm11
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm12
+; AVX512BW-NEXT:    vpermt2w %zmm4, %zmm11, %zmm12
+; AVX512BW-NEXT:    vpermi2w %zmm2, %zmm0, %zmm11
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7]
 ; AVX512BW-NEXT:    vinserti64x4 $0, %ymm11, %zmm13, %zmm11
 ; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm12 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60]
 ; AVX512BW-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm13
 ; AVX512BW-NEXT:    vpermt2w %zmm7, %zmm12, %zmm13
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm14
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm14
 ; AVX512BW-NEXT:    vpermt2w %zmm5, %zmm12, %zmm14
 ; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm14 {%k1}
-; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm13 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60]
-; AVX512BW-NEXT:    # ymm13 = mem[0,1,0,1]
-; AVX512BW-NEXT:    vpermi2w %zmm4, %zmm3, %zmm13
-; AVX512BW-NEXT:    vpermi2w %zmm1, %zmm0, %zmm12
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm13
+; AVX512BW-NEXT:    vpermt2w %zmm4, %zmm12, %zmm13
+; AVX512BW-NEXT:    vpermi2w %zmm2, %zmm0, %zmm12
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7]
 ; AVX512BW-NEXT:    vinserti64x4 $0, %ymm12, %zmm14, %zmm12
 ; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm13 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61]
 ; AVX512BW-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm14
 ; AVX512BW-NEXT:    vpermt2w %zmm7, %zmm13, %zmm14
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm15
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm15
 ; AVX512BW-NEXT:    vpermt2w %zmm5, %zmm13, %zmm15
 ; AVX512BW-NEXT:    vmovdqa64 %zmm14, %zmm15 {%k1}
-; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm14 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61]
-; AVX512BW-NEXT:    # ymm14 = mem[0,1,0,1]
-; AVX512BW-NEXT:    vpermi2w %zmm4, %zmm3, %zmm14
-; AVX512BW-NEXT:    vpermi2w %zmm1, %zmm0, %zmm13
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm14
+; AVX512BW-NEXT:    vpermt2w %zmm4, %zmm13, %zmm14
+; AVX512BW-NEXT:    vpermi2w %zmm2, %zmm0, %zmm13
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7]
 ; AVX512BW-NEXT:    vinserti64x4 $0, %ymm13, %zmm15, %zmm13
 ; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62]
 ; AVX512BW-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm15
 ; AVX512BW-NEXT:    vpermt2w %zmm7, %zmm14, %zmm15
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm16
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm16
 ; AVX512BW-NEXT:    vpermt2w %zmm5, %zmm14, %zmm16
 ; AVX512BW-NEXT:    vmovdqa64 %zmm15, %zmm16 {%k1}
-; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm15 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62]
-; AVX512BW-NEXT:    # ymm15 = mem[0,1,0,1]
-; AVX512BW-NEXT:    vpermi2w %zmm4, %zmm3, %zmm15
-; AVX512BW-NEXT:    vpermi2w %zmm1, %zmm0, %zmm14
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm15
+; AVX512BW-NEXT:    vpermt2w %zmm4, %zmm14, %zmm15
+; AVX512BW-NEXT:    vpermi2w %zmm2, %zmm0, %zmm14
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5,6,7]
 ; AVX512BW-NEXT:    vinserti64x4 $0, %ymm14, %zmm16, %zmm14
 ; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm15 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63]
 ; AVX512BW-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpermt2w %zmm7, %zmm15, %zmm6
-; AVX512BW-NEXT:    vpermt2w %zmm5, %zmm15, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm2 {%k1}
-; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63]
-; AVX512BW-NEXT:    # ymm5 = mem[0,1,0,1]
-; AVX512BW-NEXT:    vpermi2w %zmm4, %zmm3, %zmm5
-; AVX512BW-NEXT:    vpermt2w %zmm1, %zmm15, %zmm0
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm0
+; AVX512BW-NEXT:    vpermt2w %zmm5, %zmm15, %zmm3
+; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm3 {%k1}
+; AVX512BW-NEXT:    vpermt2w %zmm4, %zmm15, %zmm1
+; AVX512BW-NEXT:    vpermt2w %zmm2, %zmm15, %zmm0
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm3, %zmm0
 ; AVX512BW-NEXT:    vmovdqa64 %zmm8, (%rsi)
 ; AVX512BW-NEXT:    vmovdqa64 %zmm9, (%rdx)
 ; AVX512BW-NEXT:    vmovdqa64 %zmm10, (%rcx)
@@ -8746,282 +8737,220 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ;
 ; AVX512BW-LABEL: load_i16_stride8_vf64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    subq $1096, %rsp # imm = 0x448
-; AVX512BW-NEXT:    vmovdqa64 704(%rdi), %zmm6
-; AVX512BW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 640(%rdi), %zmm8
-; AVX512BW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 576(%rdi), %zmm28
-; AVX512BW-NEXT:    vmovdqa64 512(%rdi), %zmm11
-; AVX512BW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 832(%rdi), %zmm5
-; AVX512BW-NEXT:    vmovdqa64 768(%rdi), %zmm29
-; AVX512BW-NEXT:    vmovdqa64 960(%rdi), %zmm21
-; AVX512BW-NEXT:    vmovdqa64 896(%rdi), %zmm25
-; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm7
-; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %zmm17
-; AVX512BW-NEXT:    vmovdqa64 128(%rdi), %zmm4
-; AVX512BW-NEXT:    vmovdqa64 192(%rdi), %zmm9
-; AVX512BW-NEXT:    vmovdqa64 320(%rdi), %zmm20
-; AVX512BW-NEXT:    vmovdqa64 256(%rdi), %zmm26
-; AVX512BW-NEXT:    vmovdqa64 448(%rdi), %zmm22
-; AVX512BW-NEXT:    vmovdqa64 384(%rdi), %zmm2
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56]
-; AVX512BW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm1
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm12
-; AVX512BW-NEXT:    vpermt2w %zmm22, %zmm0, %zmm1
-; AVX512BW-NEXT:    vmovdqa64 %zmm26, %zmm2
-; AVX512BW-NEXT:    vpermt2w %zmm20, %zmm0, %zmm2
-; AVX512BW-NEXT:    movb $-64, %al
-; AVX512BW-NEXT:    kmovd %eax, %k1
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
-; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56]
-; AVX512BW-NEXT:    # ymm1 = mem[0,1,0,1]
-; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm3
-; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm10
-; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2w %zmm9, %zmm1, %zmm3
-; AVX512BW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm4
-; AVX512BW-NEXT:    vpermt2w %zmm17, %zmm0, %zmm4
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm3, %zmm2, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm25, %zmm2
-; AVX512BW-NEXT:    vpermt2w %zmm21, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 %zmm29, %zmm3
-; AVX512BW-NEXT:    vpermt2w %zmm5, %zmm0, %zmm3
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
-; AVX512BW-NEXT:    vpermi2w %zmm28, %zmm11, %zmm0
-; AVX512BW-NEXT:    vpermi2w %zmm6, %zmm8, %zmm1
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm3, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm4 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57]
+; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %r11
+; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
+; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %zmm29
+; AVX512BW-NEXT:    vmovdqa64 128(%rdi), %zmm1
+; AVX512BW-NEXT:    vmovdqa64 192(%rdi), %zmm30
+; AVX512BW-NEXT:    vmovdqa64 320(%rdi), %zmm31
+; AVX512BW-NEXT:    vmovdqa64 256(%rdi), %zmm3
+; AVX512BW-NEXT:    vmovdqa64 448(%rdi), %zmm7
+; AVX512BW-NEXT:    vmovdqa64 384(%rdi), %zmm6
+; AVX512BW-NEXT:    vmovdqa64 576(%rdi), %zmm9
+; AVX512BW-NEXT:    vmovdqa64 512(%rdi), %zmm5
+; AVX512BW-NEXT:    vmovdqa64 704(%rdi), %zmm12
+; AVX512BW-NEXT:    vmovdqa64 640(%rdi), %zmm2
+; AVX512BW-NEXT:    vmovdqa64 832(%rdi), %zmm14
+; AVX512BW-NEXT:    vmovdqa64 768(%rdi), %zmm11
+; AVX512BW-NEXT:    vmovdqa64 960(%rdi), %zmm16
+; AVX512BW-NEXT:    vmovdqa64 896(%rdi), %zmm15
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm13 = [0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56]
+; AVX512BW-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vmovdqa64 %zmm15, %zmm17
+; AVX512BW-NEXT:    vpermt2w %zmm16, %zmm13, %zmm17
+; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm18
+; AVX512BW-NEXT:    vpermt2w %zmm14, %zmm13, %zmm18
+; AVX512BW-NEXT:    movb $-64, %dil
+; AVX512BW-NEXT:    kmovd %edi, %k1
+; AVX512BW-NEXT:    vmovdqa64 %zmm17, %zmm18 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm10
+; AVX512BW-NEXT:    vpermt2w %zmm12, %zmm13, %zmm10
+; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm8
+; AVX512BW-NEXT:    vpermt2w %zmm9, %zmm13, %zmm8
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm8, %zmm18, %zmm28
+; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm8
+; AVX512BW-NEXT:    vpermt2w %zmm7, %zmm13, %zmm8
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm10
+; AVX512BW-NEXT:    vpermt2w %zmm31, %zmm13, %zmm10
+; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm10 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm8
+; AVX512BW-NEXT:    vpermt2w %zmm30, %zmm13, %zmm8
+; AVX512BW-NEXT:    vpermi2w %zmm29, %zmm0, %zmm13
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm8, %zmm10, %zmm17
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm8 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57]
+; AVX512BW-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vmovdqa64 %zmm15, %zmm10
+; AVX512BW-NEXT:    vpermt2w %zmm16, %zmm8, %zmm10
+; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm13
+; AVX512BW-NEXT:    vpermt2w %zmm14, %zmm8, %zmm13
+; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm13 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm10
+; AVX512BW-NEXT:    vpermt2w %zmm12, %zmm8, %zmm10
+; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm4
+; AVX512BW-NEXT:    vpermt2w %zmm9, %zmm8, %zmm4
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm4, %zmm13, %zmm18
+; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm4
+; AVX512BW-NEXT:    vpermt2w %zmm7, %zmm8, %zmm4
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm10
+; AVX512BW-NEXT:    vpermt2w %zmm31, %zmm8, %zmm10
+; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm10 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm4
+; AVX512BW-NEXT:    vpermt2w %zmm30, %zmm8, %zmm4
+; AVX512BW-NEXT:    vpermi2w %zmm29, %zmm0, %zmm8
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm4, %zmm10, %zmm19
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm4 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58]
+; AVX512BW-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vmovdqa64 %zmm15, %zmm8
+; AVX512BW-NEXT:    vpermt2w %zmm16, %zmm4, %zmm8
+; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm10
+; AVX512BW-NEXT:    vpermt2w %zmm14, %zmm4, %zmm10
+; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm10 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm8
+; AVX512BW-NEXT:    vpermt2w %zmm12, %zmm4, %zmm8
+; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm13
+; AVX512BW-NEXT:    vpermt2w %zmm9, %zmm4, %zmm13
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm8, %zmm10, %zmm20
+; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm8
+; AVX512BW-NEXT:    vpermt2w %zmm7, %zmm4, %zmm8
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm10
+; AVX512BW-NEXT:    vpermt2w %zmm31, %zmm4, %zmm10
+; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm10 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm8
+; AVX512BW-NEXT:    vpermt2w %zmm30, %zmm4, %zmm8
+; AVX512BW-NEXT:    vpermi2w %zmm29, %zmm0, %zmm4
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm4, %zmm10, %zmm21
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm4 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59]
+; AVX512BW-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vmovdqa64 %zmm15, %zmm8
+; AVX512BW-NEXT:    vpermt2w %zmm16, %zmm4, %zmm8
+; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm10
+; AVX512BW-NEXT:    vpermt2w %zmm14, %zmm4, %zmm10
+; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm10 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm8
+; AVX512BW-NEXT:    vpermt2w %zmm12, %zmm4, %zmm8
+; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm13
+; AVX512BW-NEXT:    vpermt2w %zmm9, %zmm4, %zmm13
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm8, %zmm10, %zmm22
+; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm8
+; AVX512BW-NEXT:    vpermt2w %zmm7, %zmm4, %zmm8
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm10
+; AVX512BW-NEXT:    vpermt2w %zmm31, %zmm4, %zmm10
+; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm10 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm8
+; AVX512BW-NEXT:    vpermt2w %zmm30, %zmm4, %zmm8
+; AVX512BW-NEXT:    vpermi2w %zmm29, %zmm0, %zmm4
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm4, %zmm10, %zmm23
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm4 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60]
+; AVX512BW-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vmovdqa64 %zmm15, %zmm8
+; AVX512BW-NEXT:    vpermt2w %zmm16, %zmm4, %zmm8
+; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm10
+; AVX512BW-NEXT:    vpermt2w %zmm14, %zmm4, %zmm10
+; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm10 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm8
+; AVX512BW-NEXT:    vpermt2w %zmm12, %zmm4, %zmm8
+; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm13
+; AVX512BW-NEXT:    vpermt2w %zmm9, %zmm4, %zmm13
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm8, %zmm10, %zmm24
+; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm8
+; AVX512BW-NEXT:    vpermt2w %zmm7, %zmm4, %zmm8
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm10
+; AVX512BW-NEXT:    vpermt2w %zmm31, %zmm4, %zmm10
+; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm10 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm8
+; AVX512BW-NEXT:    vpermt2w %zmm30, %zmm4, %zmm8
+; AVX512BW-NEXT:    vpermi2w %zmm29, %zmm0, %zmm4
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm4, %zmm10, %zmm25
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm4 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61]
 ; AVX512BW-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm12, %zmm0
-; AVX512BW-NEXT:    vpermt2w %zmm22, %zmm4, %zmm0
-; AVX512BW-NEXT:    vmovdqa64 %zmm26, %zmm1
-; AVX512BW-NEXT:    vpermt2w %zmm20, %zmm4, %zmm1
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57]
-; AVX512BW-NEXT:    # ymm2 = mem[0,1,0,1]
-; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm0
-; AVX512BW-NEXT:    vpermt2w %zmm9, %zmm2, %zmm0
-; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm2
-; AVX512BW-NEXT:    vpermt2w %zmm17, %zmm4, %zmm2
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58]
-; AVX512BW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm12, %zmm1
-; AVX512BW-NEXT:    vpermt2w %zmm22, %zmm0, %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59]
-; AVX512BW-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm12, %zmm1
-; AVX512BW-NEXT:    vpermt2w %zmm22, %zmm2, %zmm1
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm3
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm10 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60]
+; AVX512BW-NEXT:    vmovdqa64 %zmm15, %zmm8
+; AVX512BW-NEXT:    vpermt2w %zmm16, %zmm4, %zmm8
+; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm10
+; AVX512BW-NEXT:    vpermt2w %zmm14, %zmm4, %zmm10
+; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm10 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm8
+; AVX512BW-NEXT:    vpermt2w %zmm12, %zmm4, %zmm8
+; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm13
+; AVX512BW-NEXT:    vpermt2w %zmm9, %zmm4, %zmm13
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm8, %zmm10, %zmm26
+; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm8
+; AVX512BW-NEXT:    vpermt2w %zmm7, %zmm4, %zmm8
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm10
+; AVX512BW-NEXT:    vpermt2w %zmm31, %zmm4, %zmm10
+; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm10 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm8
+; AVX512BW-NEXT:    vpermt2w %zmm30, %zmm4, %zmm8
+; AVX512BW-NEXT:    vpermi2w %zmm29, %zmm0, %zmm4
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm4, %zmm10, %zmm27
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm4 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62]
+; AVX512BW-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vmovdqa64 %zmm15, %zmm8
+; AVX512BW-NEXT:    vpermt2w %zmm16, %zmm4, %zmm8
+; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm10
+; AVX512BW-NEXT:    vpermt2w %zmm14, %zmm4, %zmm10
+; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm10 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm8
+; AVX512BW-NEXT:    vpermt2w %zmm12, %zmm4, %zmm8
+; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm13
+; AVX512BW-NEXT:    vpermt2w %zmm9, %zmm4, %zmm13
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm8, %zmm10, %zmm8
+; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm10
+; AVX512BW-NEXT:    vpermt2w %zmm7, %zmm4, %zmm10
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm13
+; AVX512BW-NEXT:    vpermt2w %zmm31, %zmm4, %zmm13
+; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm13 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm10
+; AVX512BW-NEXT:    vpermt2w %zmm30, %zmm4, %zmm10
+; AVX512BW-NEXT:    vpermi2w %zmm29, %zmm0, %zmm4
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm4, %zmm13, %zmm4
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm10 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63]
 ; AVX512BW-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm12, %zmm2
-; AVX512BW-NEXT:    vpermt2w %zmm22, %zmm10, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm15 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61]
-; AVX512BW-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm12, %zmm1
-; AVX512BW-NEXT:    vpermt2w %zmm22, %zmm15, %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62]
-; AVX512BW-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm12, %zmm1
-; AVX512BW-NEXT:    vpermt2w %zmm22, %zmm13, %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63]
-; AVX512BW-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vpermt2w %zmm22, %zmm1, %zmm12
-; AVX512BW-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm26, %zmm24
-; AVX512BW-NEXT:    vpermt2w %zmm20, %zmm0, %zmm24
-; AVX512BW-NEXT:    vmovdqa64 %zmm26, %zmm27
-; AVX512BW-NEXT:    vpermt2w %zmm20, %zmm3, %zmm27
-; AVX512BW-NEXT:    vmovdqa64 %zmm26, %zmm31
-; AVX512BW-NEXT:    vpermt2w %zmm20, %zmm10, %zmm31
-; AVX512BW-NEXT:    vmovdqa64 %zmm26, %zmm30
-; AVX512BW-NEXT:    vpermt2w %zmm20, %zmm15, %zmm30
-; AVX512BW-NEXT:    vmovdqa64 %zmm26, %zmm22
-; AVX512BW-NEXT:    vpermt2w %zmm20, %zmm13, %zmm22
-; AVX512BW-NEXT:    vpermt2w %zmm20, %zmm1, %zmm26
-; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm8
-; AVX512BW-NEXT:    vpermt2w %zmm17, %zmm0, %zmm8
-; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm12
-; AVX512BW-NEXT:    vpermt2w %zmm17, %zmm3, %zmm12
-; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm6
-; AVX512BW-NEXT:    vpermt2w %zmm17, %zmm10, %zmm6
-; AVX512BW-NEXT:    vmovdqu64 %zmm6, (%rsp) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm6
-; AVX512BW-NEXT:    vpermt2w %zmm17, %zmm15, %zmm6
-; AVX512BW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm6
-; AVX512BW-NEXT:    vpermt2w %zmm17, %zmm13, %zmm6
-; AVX512BW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2w %zmm17, %zmm1, %zmm7
-; AVX512BW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm25, %zmm18
-; AVX512BW-NEXT:    vpermt2w %zmm21, %zmm4, %zmm18
-; AVX512BW-NEXT:    vmovdqa64 %zmm25, %zmm19
-; AVX512BW-NEXT:    vpermt2w %zmm21, %zmm0, %zmm19
-; AVX512BW-NEXT:    vmovdqa64 %zmm25, %zmm23
-; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm6
-; AVX512BW-NEXT:    vpermt2w %zmm21, %zmm3, %zmm23
-; AVX512BW-NEXT:    vmovdqa64 %zmm25, %zmm16
-; AVX512BW-NEXT:    vpermt2w %zmm21, %zmm10, %zmm16
-; AVX512BW-NEXT:    vmovdqa64 %zmm25, %zmm20
-; AVX512BW-NEXT:    vpermt2w %zmm21, %zmm15, %zmm20
-; AVX512BW-NEXT:    vmovdqa64 %zmm25, %zmm17
-; AVX512BW-NEXT:    vpermt2w %zmm21, %zmm13, %zmm17
-; AVX512BW-NEXT:    vpermt2w %zmm21, %zmm1, %zmm25
-; AVX512BW-NEXT:    vmovdqa64 %zmm29, %zmm9
-; AVX512BW-NEXT:    vpermt2w %zmm5, %zmm4, %zmm9
-; AVX512BW-NEXT:    vmovdqa64 %zmm29, %zmm11
-; AVX512BW-NEXT:    vpermt2w %zmm5, %zmm0, %zmm11
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm3
-; AVX512BW-NEXT:    vmovdqa64 %zmm29, %zmm14
-; AVX512BW-NEXT:    vpermt2w %zmm5, %zmm6, %zmm14
-; AVX512BW-NEXT:    vmovdqa64 %zmm29, %zmm21
-; AVX512BW-NEXT:    vpermt2w %zmm5, %zmm10, %zmm21
-; AVX512BW-NEXT:    vmovdqa64 %zmm29, %zmm7
-; AVX512BW-NEXT:    vmovdqa64 %zmm15, %zmm2
-; AVX512BW-NEXT:    vpermt2w %zmm5, %zmm15, %zmm7
-; AVX512BW-NEXT:    vmovdqa64 %zmm29, %zmm15
-; AVX512BW-NEXT:    vpermt2w %zmm5, %zmm13, %zmm15
-; AVX512BW-NEXT:    vpermt2w %zmm5, %zmm1, %zmm29
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vpermi2w %zmm28, %zmm0, %zmm4
-; AVX512BW-NEXT:    vpermi2w %zmm28, %zmm0, %zmm3
-; AVX512BW-NEXT:    vpermi2w %zmm28, %zmm0, %zmm6
-; AVX512BW-NEXT:    vpermi2w %zmm28, %zmm0, %zmm10
-; AVX512BW-NEXT:    vpermi2w %zmm28, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermi2w %zmm28, %zmm0, %zmm13
-; AVX512BW-NEXT:    vpermt2w %zmm28, %zmm1, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm18, %zmm9 {%k1}
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57]
-; AVX512BW-NEXT:    # ymm0 = mem[0,1,0,1]
-; AVX512BW-NEXT:    vpermi2w %zmm28, %zmm2, %zmm0
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm9, %zmm18
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm24 {%k1}
-; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm9 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58]
-; AVX512BW-NEXT:    # ymm9 = mem[0,1,0,1]
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2w %zmm5, %zmm9, %zmm0
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm24, %zmm24
-; AVX512BW-NEXT:    vmovdqa64 %zmm19, %zmm11 {%k1}
-; AVX512BW-NEXT:    vpermi2w %zmm28, %zmm2, %zmm9
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm9[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm11, %zmm1
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm27 {%k1}
-; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59]
-; AVX512BW-NEXT:    # ymm3 = mem[0,1,0,1]
-; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm0
-; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm9
-; AVX512BW-NEXT:    vpermt2w %zmm5, %zmm3, %zmm9
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm12[0,1,2,3],ymm9[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm4, %zmm27, %zmm4
-; AVX512BW-NEXT:    vmovdqa64 %zmm23, %zmm14 {%k1}
-; AVX512BW-NEXT:    vpermi2w %zmm28, %zmm2, %zmm3
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm3, %zmm14, %zmm3
-; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm6 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60]
-; AVX512BW-NEXT:    # ymm6 = mem[0,1,0,1]
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm8
-; AVX512BW-NEXT:    vpermt2w %zmm5, %zmm6, %zmm8
-; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm9 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61]
-; AVX512BW-NEXT:    # ymm9 = mem[0,1,0,1]
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm11
-; AVX512BW-NEXT:    vpermt2w %zmm5, %zmm9, %zmm11
-; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm12 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62]
-; AVX512BW-NEXT:    # ymm12 = mem[0,1,0,1]
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm14
-; AVX512BW-NEXT:    vpermt2w %zmm5, %zmm12, %zmm14
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} ymm19 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63]
-; AVX512BW-NEXT:    # ymm19 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vpermt2w %zmm5, %zmm19, %zmm0
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm5
-; AVX512BW-NEXT:    vpermi2w %zmm28, %zmm2, %zmm6
-; AVX512BW-NEXT:    vpermi2w %zmm28, %zmm2, %zmm9
-; AVX512BW-NEXT:    vpermi2w %zmm28, %zmm2, %zmm12
-; AVX512BW-NEXT:    vpermt2w %zmm28, %zmm19, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm0
-; AVX512BW-NEXT:    vpblendd $15, (%rsp), %ymm8, %ymm8 # 32-byte Folded Reload
-; AVX512BW-NEXT:    # ymm8 = mem[0,1,2,3],ymm8[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm31 {%k1}
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm8, %zmm31, %zmm8
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqa64 %zmm16, %zmm21 {%k1}
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm6, %zmm21, %zmm6
-; AVX512BW-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm10 # 32-byte Folded Reload
-; AVX512BW-NEXT:    # ymm10 = mem[0,1,2,3],ymm11[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm30 {%k1}
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm10, %zmm30, %zmm10
-; AVX512BW-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload
-; AVX512BW-NEXT:    # ymm9 = mem[0,1,2,3],ymm9[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqa64 %zmm20, %zmm7 {%k1}
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm9, %zmm7, %zmm7
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm22 {%k1}
-; AVX512BW-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm9 # 32-byte Folded Reload
-; AVX512BW-NEXT:    # ymm9 = mem[0,1,2,3],ymm14[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm9, %zmm22, %zmm9
-; AVX512BW-NEXT:    vmovdqa64 %zmm17, %zmm15 {%k1}
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm12[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm15, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm26 {%k1}
-; AVX512BW-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm11 # 32-byte Folded Reload
-; AVX512BW-NEXT:    # ymm11 = mem[0,1,2,3],ymm5[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm11, %zmm26, %zmm5
-; AVX512BW-NEXT:    vmovdqa64 %zmm25, %zmm29 {%k1}
-; AVX512BW-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload
-; AVX512BW-NEXT:    # ymm11 = mem[0,1,2,3],ymm0[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm11, %zmm29, %zmm11
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 64(%rsi)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, (%rsi)
+; AVX512BW-NEXT:    vpermt2w %zmm16, %zmm10, %zmm15
+; AVX512BW-NEXT:    vpermt2w %zmm14, %zmm10, %zmm11
+; AVX512BW-NEXT:    vmovdqa64 %zmm15, %zmm11 {%k1}
+; AVX512BW-NEXT:    vpermt2w %zmm12, %zmm10, %zmm2
+; AVX512BW-NEXT:    vpermt2w %zmm9, %zmm10, %zmm5
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm11, %zmm2
+; AVX512BW-NEXT:    vpermt2w %zmm7, %zmm10, %zmm6
+; AVX512BW-NEXT:    vpermt2w %zmm31, %zmm10, %zmm3
+; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm3 {%k1}
+; AVX512BW-NEXT:    vpermt2w %zmm30, %zmm10, %zmm1
+; AVX512BW-NEXT:    vpermt2w %zmm29, %zmm10, %zmm0
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm3, %zmm0
+; AVX512BW-NEXT:    vmovdqa64 %zmm28, 64(%rsi)
+; AVX512BW-NEXT:    vmovdqa64 %zmm17, (%rsi)
 ; AVX512BW-NEXT:    vmovdqa64 %zmm18, 64(%rdx)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, (%rdx)
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, 64(%rcx)
-; AVX512BW-NEXT:    vmovdqa64 %zmm24, (%rcx)
-; AVX512BW-NEXT:    vmovdqa64 %zmm3, 64(%r8)
-; AVX512BW-NEXT:    vmovdqa64 %zmm4, (%r8)
-; AVX512BW-NEXT:    vmovdqa64 %zmm6, 64(%r9)
-; AVX512BW-NEXT:    vmovdqa64 %zmm8, (%r9)
-; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-NEXT:    vmovdqa64 %zmm7, 64(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm10, (%rax)
-; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512BW-NEXT:    vmovdqa64 %zmm19, (%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm20, 64(%rcx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm21, (%rcx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm22, 64(%r8)
+; AVX512BW-NEXT:    vmovdqa64 %zmm23, (%r8)
+; AVX512BW-NEXT:    vmovdqa64 %zmm24, 64(%r9)
+; AVX512BW-NEXT:    vmovdqa64 %zmm25, (%r9)
+; AVX512BW-NEXT:    vmovdqa64 %zmm26, 64(%r11)
+; AVX512BW-NEXT:    vmovdqa64 %zmm27, (%r11)
+; AVX512BW-NEXT:    vmovdqa64 %zmm8, 64(%r10)
+; AVX512BW-NEXT:    vmovdqa64 %zmm4, (%r10)
 ; AVX512BW-NEXT:    vmovdqa64 %zmm2, 64(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm9, (%rax)
-; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-NEXT:    vmovdqa64 %zmm11, 64(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm5, (%rax)
-; AVX512BW-NEXT:    addq $1096, %rsp # imm = 0x448
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rax)
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
   %wide.vec = load <512 x i16>, ptr %in.vec, align 64

diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll
index 1bd7025307d2b9..faa1642831c155 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll
@@ -1617,117 +1617,108 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %r10
 ; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %r11
 ; AVX512F-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512F-NEXT:    vmovdqa64 64(%rdi), %zmm1
-; AVX512F-NEXT:    vmovdqa64 128(%rdi), %zmm3
+; AVX512F-NEXT:    vmovdqa64 64(%rdi), %zmm2
+; AVX512F-NEXT:    vmovdqa64 128(%rdi), %zmm1
 ; AVX512F-NEXT:    vmovdqa64 192(%rdi), %zmm4
 ; AVX512F-NEXT:    vmovdqa64 320(%rdi), %zmm5
-; AVX512F-NEXT:    vmovdqa64 256(%rdi), %zmm2
+; AVX512F-NEXT:    vmovdqa64 256(%rdi), %zmm3
 ; AVX512F-NEXT:    vmovdqa64 448(%rdi), %zmm7
 ; AVX512F-NEXT:    vmovdqa64 384(%rdi), %zmm6
 ; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm8 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24]
 ; AVX512F-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm9
 ; AVX512F-NEXT:    vpermt2d %zmm7, %zmm8, %zmm9
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm10
+; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm10
 ; AVX512F-NEXT:    vpermt2d %zmm5, %zmm8, %zmm10
 ; AVX512F-NEXT:    movb $-64, %dil
 ; AVX512F-NEXT:    kmovw %edi, %k1
 ; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm10 {%k1}
-; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm9 = [0,8,16,24,0,8,16,24]
-; AVX512F-NEXT:    # ymm9 = mem[0,1,0,1]
-; AVX512F-NEXT:    vpermi2d %zmm4, %zmm3, %zmm9
-; AVX512F-NEXT:    vpermi2d %zmm1, %zmm0, %zmm8
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm9
+; AVX512F-NEXT:    vpermt2d %zmm4, %zmm8, %zmm9
+; AVX512F-NEXT:    vpermi2d %zmm2, %zmm0, %zmm8
 ; AVX512F-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
 ; AVX512F-NEXT:    vinserti64x4 $0, %ymm8, %zmm10, %zmm8
 ; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm9 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25]
 ; AVX512F-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm10
 ; AVX512F-NEXT:    vpermt2d %zmm7, %zmm9, %zmm10
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm11
+; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm11
 ; AVX512F-NEXT:    vpermt2d %zmm5, %zmm9, %zmm11
 ; AVX512F-NEXT:    vmovdqa64 %zmm10, %zmm11 {%k1}
-; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm10 = [1,9,17,25,1,9,17,25]
-; AVX512F-NEXT:    # ymm10 = mem[0,1,0,1]
-; AVX512F-NEXT:    vpermi2d %zmm4, %zmm3, %zmm10
-; AVX512F-NEXT:    vpermi2d %zmm1, %zmm0, %zmm9
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm10
+; AVX512F-NEXT:    vpermt2d %zmm4, %zmm9, %zmm10
+; AVX512F-NEXT:    vpermi2d %zmm2, %zmm0, %zmm9
 ; AVX512F-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7]
 ; AVX512F-NEXT:    vinserti64x4 $0, %ymm9, %zmm11, %zmm9
 ; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm10 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26]
 ; AVX512F-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm11
 ; AVX512F-NEXT:    vpermt2d %zmm7, %zmm10, %zmm11
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm12
+; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm12
 ; AVX512F-NEXT:    vpermt2d %zmm5, %zmm10, %zmm12
 ; AVX512F-NEXT:    vmovdqa64 %zmm11, %zmm12 {%k1}
-; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm11 = [2,10,18,26,2,10,18,26]
-; AVX512F-NEXT:    # ymm11 = mem[0,1,0,1]
-; AVX512F-NEXT:    vpermi2d %zmm4, %zmm3, %zmm11
-; AVX512F-NEXT:    vpermi2d %zmm1, %zmm0, %zmm10
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm11
+; AVX512F-NEXT:    vpermt2d %zmm4, %zmm10, %zmm11
+; AVX512F-NEXT:    vpermi2d %zmm2, %zmm0, %zmm10
 ; AVX512F-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7]
 ; AVX512F-NEXT:    vinserti64x4 $0, %ymm10, %zmm12, %zmm10
 ; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm11 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27]
 ; AVX512F-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm12
 ; AVX512F-NEXT:    vpermt2d %zmm7, %zmm11, %zmm12
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm13
+; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm13
 ; AVX512F-NEXT:    vpermt2d %zmm5, %zmm11, %zmm13
 ; AVX512F-NEXT:    vmovdqa64 %zmm12, %zmm13 {%k1}
-; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm12 = [3,11,19,27,3,11,19,27]
-; AVX512F-NEXT:    # ymm12 = mem[0,1,0,1]
-; AVX512F-NEXT:    vpermi2d %zmm4, %zmm3, %zmm12
-; AVX512F-NEXT:    vpermi2d %zmm1, %zmm0, %zmm11
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm12
+; AVX512F-NEXT:    vpermt2d %zmm4, %zmm11, %zmm12
+; AVX512F-NEXT:    vpermi2d %zmm2, %zmm0, %zmm11
 ; AVX512F-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7]
 ; AVX512F-NEXT:    vinserti64x4 $0, %ymm11, %zmm13, %zmm11
 ; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm12 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28]
 ; AVX512F-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm13
 ; AVX512F-NEXT:    vpermt2d %zmm7, %zmm12, %zmm13
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm14
+; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm14
 ; AVX512F-NEXT:    vpermt2d %zmm5, %zmm12, %zmm14
 ; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm14 {%k1}
-; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm13 = [4,12,20,28,4,12,20,28]
-; AVX512F-NEXT:    # ymm13 = mem[0,1,0,1]
-; AVX512F-NEXT:    vpermi2d %zmm4, %zmm3, %zmm13
-; AVX512F-NEXT:    vpermi2d %zmm1, %zmm0, %zmm12
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm13
+; AVX512F-NEXT:    vpermt2d %zmm4, %zmm12, %zmm13
+; AVX512F-NEXT:    vpermi2d %zmm2, %zmm0, %zmm12
 ; AVX512F-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7]
 ; AVX512F-NEXT:    vinserti64x4 $0, %ymm12, %zmm14, %zmm12
 ; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm13 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29]
 ; AVX512F-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm14
 ; AVX512F-NEXT:    vpermt2d %zmm7, %zmm13, %zmm14
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm15
+; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm15
 ; AVX512F-NEXT:    vpermt2d %zmm5, %zmm13, %zmm15
 ; AVX512F-NEXT:    vmovdqa64 %zmm14, %zmm15 {%k1}
-; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm14 = [5,13,21,29,5,13,21,29]
-; AVX512F-NEXT:    # ymm14 = mem[0,1,0,1]
-; AVX512F-NEXT:    vpermi2d %zmm4, %zmm3, %zmm14
-; AVX512F-NEXT:    vpermi2d %zmm1, %zmm0, %zmm13
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm14
+; AVX512F-NEXT:    vpermt2d %zmm4, %zmm13, %zmm14
+; AVX512F-NEXT:    vpermi2d %zmm2, %zmm0, %zmm13
 ; AVX512F-NEXT:    vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7]
 ; AVX512F-NEXT:    vinserti64x4 $0, %ymm13, %zmm15, %zmm13
 ; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30]
 ; AVX512F-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm15
 ; AVX512F-NEXT:    vpermt2d %zmm7, %zmm14, %zmm15
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm16
+; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm16
 ; AVX512F-NEXT:    vpermt2d %zmm5, %zmm14, %zmm16
 ; AVX512F-NEXT:    vmovdqa64 %zmm15, %zmm16 {%k1}
-; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm15 = [6,14,22,30,6,14,22,30]
-; AVX512F-NEXT:    # ymm15 = mem[0,1,0,1]
-; AVX512F-NEXT:    vpermi2d %zmm4, %zmm3, %zmm15
-; AVX512F-NEXT:    vpermi2d %zmm1, %zmm0, %zmm14
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm15
+; AVX512F-NEXT:    vpermt2d %zmm4, %zmm14, %zmm15
+; AVX512F-NEXT:    vpermi2d %zmm2, %zmm0, %zmm14
 ; AVX512F-NEXT:    vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5,6,7]
 ; AVX512F-NEXT:    vinserti64x4 $0, %ymm14, %zmm16, %zmm14
 ; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm15 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31]
 ; AVX512F-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512F-NEXT:    vpermt2d %zmm7, %zmm15, %zmm6
-; AVX512F-NEXT:    vpermt2d %zmm5, %zmm15, %zmm2
-; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm2 {%k1}
-; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [7,15,23,31,7,15,23,31]
-; AVX512F-NEXT:    # ymm5 = mem[0,1,0,1]
-; AVX512F-NEXT:    vpermi2d %zmm4, %zmm3, %zmm5
-; AVX512F-NEXT:    vpermt2d %zmm1, %zmm15, %zmm0
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm0
+; AVX512F-NEXT:    vpermt2d %zmm5, %zmm15, %zmm3
+; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm3 {%k1}
+; AVX512F-NEXT:    vpermt2d %zmm4, %zmm15, %zmm1
+; AVX512F-NEXT:    vpermt2d %zmm2, %zmm15, %zmm0
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm3, %zmm0
 ; AVX512F-NEXT:    vmovdqa64 %zmm8, (%rsi)
 ; AVX512F-NEXT:    vmovdqa64 %zmm9, (%rdx)
 ; AVX512F-NEXT:    vmovdqa64 %zmm10, (%rcx)
@@ -1745,117 +1736,108 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %r10
 ; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %r11
 ; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %zmm1
-; AVX512BW-NEXT:    vmovdqa64 128(%rdi), %zmm3
+; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %zmm2
+; AVX512BW-NEXT:    vmovdqa64 128(%rdi), %zmm1
 ; AVX512BW-NEXT:    vmovdqa64 192(%rdi), %zmm4
 ; AVX512BW-NEXT:    vmovdqa64 320(%rdi), %zmm5
-; AVX512BW-NEXT:    vmovdqa64 256(%rdi), %zmm2
+; AVX512BW-NEXT:    vmovdqa64 256(%rdi), %zmm3
 ; AVX512BW-NEXT:    vmovdqa64 448(%rdi), %zmm7
 ; AVX512BW-NEXT:    vmovdqa64 384(%rdi), %zmm6
 ; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm8 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24]
 ; AVX512BW-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm9
 ; AVX512BW-NEXT:    vpermt2d %zmm7, %zmm8, %zmm9
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm10
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm10
 ; AVX512BW-NEXT:    vpermt2d %zmm5, %zmm8, %zmm10
 ; AVX512BW-NEXT:    movb $-64, %dil
 ; AVX512BW-NEXT:    kmovd %edi, %k1
 ; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm10 {%k1}
-; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm9 = [0,8,16,24,0,8,16,24]
-; AVX512BW-NEXT:    # ymm9 = mem[0,1,0,1]
-; AVX512BW-NEXT:    vpermi2d %zmm4, %zmm3, %zmm9
-; AVX512BW-NEXT:    vpermi2d %zmm1, %zmm0, %zmm8
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm9
+; AVX512BW-NEXT:    vpermt2d %zmm4, %zmm8, %zmm9
+; AVX512BW-NEXT:    vpermi2d %zmm2, %zmm0, %zmm8
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
 ; AVX512BW-NEXT:    vinserti64x4 $0, %ymm8, %zmm10, %zmm8
 ; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm9 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25]
 ; AVX512BW-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm10
 ; AVX512BW-NEXT:    vpermt2d %zmm7, %zmm9, %zmm10
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm11
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm11
 ; AVX512BW-NEXT:    vpermt2d %zmm5, %zmm9, %zmm11
 ; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm11 {%k1}
-; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm10 = [1,9,17,25,1,9,17,25]
-; AVX512BW-NEXT:    # ymm10 = mem[0,1,0,1]
-; AVX512BW-NEXT:    vpermi2d %zmm4, %zmm3, %zmm10
-; AVX512BW-NEXT:    vpermi2d %zmm1, %zmm0, %zmm9
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm10
+; AVX512BW-NEXT:    vpermt2d %zmm4, %zmm9, %zmm10
+; AVX512BW-NEXT:    vpermi2d %zmm2, %zmm0, %zmm9
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7]
 ; AVX512BW-NEXT:    vinserti64x4 $0, %ymm9, %zmm11, %zmm9
 ; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm10 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26]
 ; AVX512BW-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm11
 ; AVX512BW-NEXT:    vpermt2d %zmm7, %zmm10, %zmm11
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm12
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm12
 ; AVX512BW-NEXT:    vpermt2d %zmm5, %zmm10, %zmm12
 ; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm12 {%k1}
-; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm11 = [2,10,18,26,2,10,18,26]
-; AVX512BW-NEXT:    # ymm11 = mem[0,1,0,1]
-; AVX512BW-NEXT:    vpermi2d %zmm4, %zmm3, %zmm11
-; AVX512BW-NEXT:    vpermi2d %zmm1, %zmm0, %zmm10
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm11
+; AVX512BW-NEXT:    vpermt2d %zmm4, %zmm10, %zmm11
+; AVX512BW-NEXT:    vpermi2d %zmm2, %zmm0, %zmm10
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7]
 ; AVX512BW-NEXT:    vinserti64x4 $0, %ymm10, %zmm12, %zmm10
 ; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm11 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27]
 ; AVX512BW-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm12
 ; AVX512BW-NEXT:    vpermt2d %zmm7, %zmm11, %zmm12
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm13
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm13
 ; AVX512BW-NEXT:    vpermt2d %zmm5, %zmm11, %zmm13
 ; AVX512BW-NEXT:    vmovdqa64 %zmm12, %zmm13 {%k1}
-; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm12 = [3,11,19,27,3,11,19,27]
-; AVX512BW-NEXT:    # ymm12 = mem[0,1,0,1]
-; AVX512BW-NEXT:    vpermi2d %zmm4, %zmm3, %zmm12
-; AVX512BW-NEXT:    vpermi2d %zmm1, %zmm0, %zmm11
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm12
+; AVX512BW-NEXT:    vpermt2d %zmm4, %zmm11, %zmm12
+; AVX512BW-NEXT:    vpermi2d %zmm2, %zmm0, %zmm11
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7]
 ; AVX512BW-NEXT:    vinserti64x4 $0, %ymm11, %zmm13, %zmm11
 ; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm12 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28]
 ; AVX512BW-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm13
 ; AVX512BW-NEXT:    vpermt2d %zmm7, %zmm12, %zmm13
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm14
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm14
 ; AVX512BW-NEXT:    vpermt2d %zmm5, %zmm12, %zmm14
 ; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm14 {%k1}
-; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm13 = [4,12,20,28,4,12,20,28]
-; AVX512BW-NEXT:    # ymm13 = mem[0,1,0,1]
-; AVX512BW-NEXT:    vpermi2d %zmm4, %zmm3, %zmm13
-; AVX512BW-NEXT:    vpermi2d %zmm1, %zmm0, %zmm12
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm13
+; AVX512BW-NEXT:    vpermt2d %zmm4, %zmm12, %zmm13
+; AVX512BW-NEXT:    vpermi2d %zmm2, %zmm0, %zmm12
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7]
 ; AVX512BW-NEXT:    vinserti64x4 $0, %ymm12, %zmm14, %zmm12
 ; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm13 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29]
 ; AVX512BW-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm14
 ; AVX512BW-NEXT:    vpermt2d %zmm7, %zmm13, %zmm14
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm15
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm15
 ; AVX512BW-NEXT:    vpermt2d %zmm5, %zmm13, %zmm15
 ; AVX512BW-NEXT:    vmovdqa64 %zmm14, %zmm15 {%k1}
-; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm14 = [5,13,21,29,5,13,21,29]
-; AVX512BW-NEXT:    # ymm14 = mem[0,1,0,1]
-; AVX512BW-NEXT:    vpermi2d %zmm4, %zmm3, %zmm14
-; AVX512BW-NEXT:    vpermi2d %zmm1, %zmm0, %zmm13
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm14
+; AVX512BW-NEXT:    vpermt2d %zmm4, %zmm13, %zmm14
+; AVX512BW-NEXT:    vpermi2d %zmm2, %zmm0, %zmm13
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7]
 ; AVX512BW-NEXT:    vinserti64x4 $0, %ymm13, %zmm15, %zmm13
 ; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30]
 ; AVX512BW-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm15
 ; AVX512BW-NEXT:    vpermt2d %zmm7, %zmm14, %zmm15
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm16
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm16
 ; AVX512BW-NEXT:    vpermt2d %zmm5, %zmm14, %zmm16
 ; AVX512BW-NEXT:    vmovdqa64 %zmm15, %zmm16 {%k1}
-; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm15 = [6,14,22,30,6,14,22,30]
-; AVX512BW-NEXT:    # ymm15 = mem[0,1,0,1]
-; AVX512BW-NEXT:    vpermi2d %zmm4, %zmm3, %zmm15
-; AVX512BW-NEXT:    vpermi2d %zmm1, %zmm0, %zmm14
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm15
+; AVX512BW-NEXT:    vpermt2d %zmm4, %zmm14, %zmm15
+; AVX512BW-NEXT:    vpermi2d %zmm2, %zmm0, %zmm14
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5,6,7]
 ; AVX512BW-NEXT:    vinserti64x4 $0, %ymm14, %zmm16, %zmm14
 ; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm15 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31]
 ; AVX512BW-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpermt2d %zmm7, %zmm15, %zmm6
-; AVX512BW-NEXT:    vpermt2d %zmm5, %zmm15, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm2 {%k1}
-; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [7,15,23,31,7,15,23,31]
-; AVX512BW-NEXT:    # ymm5 = mem[0,1,0,1]
-; AVX512BW-NEXT:    vpermi2d %zmm4, %zmm3, %zmm5
-; AVX512BW-NEXT:    vpermt2d %zmm1, %zmm15, %zmm0
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm0
+; AVX512BW-NEXT:    vpermt2d %zmm5, %zmm15, %zmm3
+; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm3 {%k1}
+; AVX512BW-NEXT:    vpermt2d %zmm4, %zmm15, %zmm1
+; AVX512BW-NEXT:    vpermt2d %zmm2, %zmm15, %zmm0
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm3, %zmm0
 ; AVX512BW-NEXT:    vmovdqa64 %zmm8, (%rsi)
 ; AVX512BW-NEXT:    vmovdqa64 %zmm9, (%rdx)
 ; AVX512BW-NEXT:    vmovdqa64 %zmm10, (%rcx)
@@ -3689,563 +3671,439 @@ define void @load_i32_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ;
 ; AVX512F-LABEL: load_i32_stride8_vf32:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    subq $1096, %rsp # imm = 0x448
-; AVX512F-NEXT:    vmovdqa64 704(%rdi), %zmm6
-; AVX512F-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 640(%rdi), %zmm8
-; AVX512F-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 576(%rdi), %zmm28
-; AVX512F-NEXT:    vmovdqa64 512(%rdi), %zmm11
-; AVX512F-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 832(%rdi), %zmm5
-; AVX512F-NEXT:    vmovdqa64 768(%rdi), %zmm29
-; AVX512F-NEXT:    vmovdqa64 960(%rdi), %zmm21
-; AVX512F-NEXT:    vmovdqa64 896(%rdi), %zmm25
-; AVX512F-NEXT:    vmovdqa64 (%rdi), %zmm7
-; AVX512F-NEXT:    vmovdqa64 64(%rdi), %zmm17
-; AVX512F-NEXT:    vmovdqa64 128(%rdi), %zmm4
-; AVX512F-NEXT:    vmovdqa64 192(%rdi), %zmm9
-; AVX512F-NEXT:    vmovdqa64 320(%rdi), %zmm20
-; AVX512F-NEXT:    vmovdqa64 256(%rdi), %zmm26
-; AVX512F-NEXT:    vmovdqa64 448(%rdi), %zmm22
-; AVX512F-NEXT:    vmovdqa64 384(%rdi), %zmm2
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24]
-; AVX512F-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm1
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm12
-; AVX512F-NEXT:    vpermt2d %zmm22, %zmm0, %zmm1
-; AVX512F-NEXT:    vmovdqa64 %zmm26, %zmm2
-; AVX512F-NEXT:    vpermt2d %zmm20, %zmm0, %zmm2
-; AVX512F-NEXT:    movb $-64, %al
-; AVX512F-NEXT:    kmovw %eax, %k1
-; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
-; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [0,8,16,24,0,8,16,24]
-; AVX512F-NEXT:    # ymm1 = mem[0,1,0,1]
-; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm3
-; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm10
-; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2d %zmm9, %zmm1, %zmm3
-; AVX512F-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm7, %zmm4
-; AVX512F-NEXT:    vpermt2d %zmm17, %zmm0, %zmm4
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm3, %zmm2, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm25, %zmm2
-; AVX512F-NEXT:    vpermt2d %zmm21, %zmm0, %zmm2
-; AVX512F-NEXT:    vmovdqa64 %zmm29, %zmm3
-; AVX512F-NEXT:    vpermt2d %zmm5, %zmm0, %zmm3
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
-; AVX512F-NEXT:    vpermi2d %zmm28, %zmm11, %zmm0
-; AVX512F-NEXT:    vpermi2d %zmm6, %zmm8, %zmm1
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm3, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm4 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25]
+; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %r11
+; AVX512F-NEXT:    vmovdqa64 (%rdi), %zmm0
+; AVX512F-NEXT:    vmovdqa64 64(%rdi), %zmm29
+; AVX512F-NEXT:    vmovdqa64 128(%rdi), %zmm1
+; AVX512F-NEXT:    vmovdqa64 192(%rdi), %zmm30
+; AVX512F-NEXT:    vmovdqa64 320(%rdi), %zmm31
+; AVX512F-NEXT:    vmovdqa64 256(%rdi), %zmm3
+; AVX512F-NEXT:    vmovdqa64 448(%rdi), %zmm7
+; AVX512F-NEXT:    vmovdqa64 384(%rdi), %zmm6
+; AVX512F-NEXT:    vmovdqa64 576(%rdi), %zmm9
+; AVX512F-NEXT:    vmovdqa64 512(%rdi), %zmm5
+; AVX512F-NEXT:    vmovdqa64 704(%rdi), %zmm12
+; AVX512F-NEXT:    vmovdqa64 640(%rdi), %zmm2
+; AVX512F-NEXT:    vmovdqa64 832(%rdi), %zmm14
+; AVX512F-NEXT:    vmovdqa64 768(%rdi), %zmm11
+; AVX512F-NEXT:    vmovdqa64 960(%rdi), %zmm16
+; AVX512F-NEXT:    vmovdqa64 896(%rdi), %zmm15
+; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm13 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24]
+; AVX512F-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vmovdqa64 %zmm15, %zmm17
+; AVX512F-NEXT:    vpermt2d %zmm16, %zmm13, %zmm17
+; AVX512F-NEXT:    vmovdqa64 %zmm11, %zmm18
+; AVX512F-NEXT:    vpermt2d %zmm14, %zmm13, %zmm18
+; AVX512F-NEXT:    movb $-64, %dil
+; AVX512F-NEXT:    kmovw %edi, %k1
+; AVX512F-NEXT:    vmovdqa64 %zmm17, %zmm18 {%k1}
+; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm10
+; AVX512F-NEXT:    vpermt2d %zmm12, %zmm13, %zmm10
+; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm8
+; AVX512F-NEXT:    vpermt2d %zmm9, %zmm13, %zmm8
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm8, %zmm18, %zmm28
+; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm8
+; AVX512F-NEXT:    vpermt2d %zmm7, %zmm13, %zmm8
+; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm10
+; AVX512F-NEXT:    vpermt2d %zmm31, %zmm13, %zmm10
+; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm10 {%k1}
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm8
+; AVX512F-NEXT:    vpermt2d %zmm30, %zmm13, %zmm8
+; AVX512F-NEXT:    vpermi2d %zmm29, %zmm0, %zmm13
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm8, %zmm10, %zmm17
+; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm8 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25]
+; AVX512F-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vmovdqa64 %zmm15, %zmm10
+; AVX512F-NEXT:    vpermt2d %zmm16, %zmm8, %zmm10
+; AVX512F-NEXT:    vmovdqa64 %zmm11, %zmm13
+; AVX512F-NEXT:    vpermt2d %zmm14, %zmm8, %zmm13
+; AVX512F-NEXT:    vmovdqa64 %zmm10, %zmm13 {%k1}
+; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm10
+; AVX512F-NEXT:    vpermt2d %zmm12, %zmm8, %zmm10
+; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm4
+; AVX512F-NEXT:    vpermt2d %zmm9, %zmm8, %zmm4
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm4, %zmm13, %zmm18
+; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm4
+; AVX512F-NEXT:    vpermt2d %zmm7, %zmm8, %zmm4
+; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm10
+; AVX512F-NEXT:    vpermt2d %zmm31, %zmm8, %zmm10
+; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm10 {%k1}
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm4
+; AVX512F-NEXT:    vpermt2d %zmm30, %zmm8, %zmm4
+; AVX512F-NEXT:    vpermi2d %zmm29, %zmm0, %zmm8
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm4, %zmm10, %zmm19
+; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm4 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26]
 ; AVX512F-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm12, %zmm0
-; AVX512F-NEXT:    vpermt2d %zmm22, %zmm4, %zmm0
-; AVX512F-NEXT:    vmovdqa64 %zmm26, %zmm1
-; AVX512F-NEXT:    vpermt2d %zmm20, %zmm4, %zmm1
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [1,9,17,25,1,9,17,25]
-; AVX512F-NEXT:    # ymm2 = mem[0,1,0,1]
-; AVX512F-NEXT:    vmovdqa64 %zmm10, %zmm0
-; AVX512F-NEXT:    vpermt2d %zmm9, %zmm2, %zmm0
-; AVX512F-NEXT:    vmovdqa64 %zmm7, %zmm2
-; AVX512F-NEXT:    vpermt2d %zmm17, %zmm4, %zmm2
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26]
-; AVX512F-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm12, %zmm1
-; AVX512F-NEXT:    vpermt2d %zmm22, %zmm0, %zmm1
-; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27]
-; AVX512F-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm12, %zmm1
-; AVX512F-NEXT:    vpermt2d %zmm22, %zmm2, %zmm1
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm3
-; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm10 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28]
+; AVX512F-NEXT:    vmovdqa64 %zmm15, %zmm8
+; AVX512F-NEXT:    vpermt2d %zmm16, %zmm4, %zmm8
+; AVX512F-NEXT:    vmovdqa64 %zmm11, %zmm10
+; AVX512F-NEXT:    vpermt2d %zmm14, %zmm4, %zmm10
+; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm10 {%k1}
+; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm8
+; AVX512F-NEXT:    vpermt2d %zmm12, %zmm4, %zmm8
+; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm13
+; AVX512F-NEXT:    vpermt2d %zmm9, %zmm4, %zmm13
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm8, %zmm10, %zmm20
+; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm8
+; AVX512F-NEXT:    vpermt2d %zmm7, %zmm4, %zmm8
+; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm10
+; AVX512F-NEXT:    vpermt2d %zmm31, %zmm4, %zmm10
+; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm10 {%k1}
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm8
+; AVX512F-NEXT:    vpermt2d %zmm30, %zmm4, %zmm8
+; AVX512F-NEXT:    vpermi2d %zmm29, %zmm0, %zmm4
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm4, %zmm10, %zmm21
+; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm4 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27]
+; AVX512F-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vmovdqa64 %zmm15, %zmm8
+; AVX512F-NEXT:    vpermt2d %zmm16, %zmm4, %zmm8
+; AVX512F-NEXT:    vmovdqa64 %zmm11, %zmm10
+; AVX512F-NEXT:    vpermt2d %zmm14, %zmm4, %zmm10
+; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm10 {%k1}
+; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm8
+; AVX512F-NEXT:    vpermt2d %zmm12, %zmm4, %zmm8
+; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm13
+; AVX512F-NEXT:    vpermt2d %zmm9, %zmm4, %zmm13
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm8, %zmm10, %zmm22
+; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm8
+; AVX512F-NEXT:    vpermt2d %zmm7, %zmm4, %zmm8
+; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm10
+; AVX512F-NEXT:    vpermt2d %zmm31, %zmm4, %zmm10
+; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm10 {%k1}
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm8
+; AVX512F-NEXT:    vpermt2d %zmm30, %zmm4, %zmm8
+; AVX512F-NEXT:    vpermi2d %zmm29, %zmm0, %zmm4
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm4, %zmm10, %zmm23
+; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm4 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28]
+; AVX512F-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vmovdqa64 %zmm15, %zmm8
+; AVX512F-NEXT:    vpermt2d %zmm16, %zmm4, %zmm8
+; AVX512F-NEXT:    vmovdqa64 %zmm11, %zmm10
+; AVX512F-NEXT:    vpermt2d %zmm14, %zmm4, %zmm10
+; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm10 {%k1}
+; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm8
+; AVX512F-NEXT:    vpermt2d %zmm12, %zmm4, %zmm8
+; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm13
+; AVX512F-NEXT:    vpermt2d %zmm9, %zmm4, %zmm13
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm8, %zmm10, %zmm24
+; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm8
+; AVX512F-NEXT:    vpermt2d %zmm7, %zmm4, %zmm8
+; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm10
+; AVX512F-NEXT:    vpermt2d %zmm31, %zmm4, %zmm10
+; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm10 {%k1}
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm8
+; AVX512F-NEXT:    vpermt2d %zmm30, %zmm4, %zmm8
+; AVX512F-NEXT:    vpermi2d %zmm29, %zmm0, %zmm4
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm4, %zmm10, %zmm25
+; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm4 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29]
+; AVX512F-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vmovdqa64 %zmm15, %zmm8
+; AVX512F-NEXT:    vpermt2d %zmm16, %zmm4, %zmm8
+; AVX512F-NEXT:    vmovdqa64 %zmm11, %zmm10
+; AVX512F-NEXT:    vpermt2d %zmm14, %zmm4, %zmm10
+; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm10 {%k1}
+; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm8
+; AVX512F-NEXT:    vpermt2d %zmm12, %zmm4, %zmm8
+; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm13
+; AVX512F-NEXT:    vpermt2d %zmm9, %zmm4, %zmm13
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm8, %zmm10, %zmm26
+; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm8
+; AVX512F-NEXT:    vpermt2d %zmm7, %zmm4, %zmm8
+; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm10
+; AVX512F-NEXT:    vpermt2d %zmm31, %zmm4, %zmm10
+; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm10 {%k1}
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm8
+; AVX512F-NEXT:    vpermt2d %zmm30, %zmm4, %zmm8
+; AVX512F-NEXT:    vpermi2d %zmm29, %zmm0, %zmm4
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm4, %zmm10, %zmm27
+; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm4 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30]
+; AVX512F-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vmovdqa64 %zmm15, %zmm8
+; AVX512F-NEXT:    vpermt2d %zmm16, %zmm4, %zmm8
+; AVX512F-NEXT:    vmovdqa64 %zmm11, %zmm10
+; AVX512F-NEXT:    vpermt2d %zmm14, %zmm4, %zmm10
+; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm10 {%k1}
+; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm8
+; AVX512F-NEXT:    vpermt2d %zmm12, %zmm4, %zmm8
+; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm13
+; AVX512F-NEXT:    vpermt2d %zmm9, %zmm4, %zmm13
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm8, %zmm10, %zmm8
+; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm10
+; AVX512F-NEXT:    vpermt2d %zmm7, %zmm4, %zmm10
+; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm13
+; AVX512F-NEXT:    vpermt2d %zmm31, %zmm4, %zmm13
+; AVX512F-NEXT:    vmovdqa64 %zmm10, %zmm13 {%k1}
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm10
+; AVX512F-NEXT:    vpermt2d %zmm30, %zmm4, %zmm10
+; AVX512F-NEXT:    vpermi2d %zmm29, %zmm0, %zmm4
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm4, %zmm13, %zmm4
+; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm10 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31]
 ; AVX512F-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm12, %zmm2
-; AVX512F-NEXT:    vpermt2d %zmm22, %zmm10, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm15 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29]
-; AVX512F-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm12, %zmm1
-; AVX512F-NEXT:    vpermt2d %zmm22, %zmm15, %zmm1
-; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30]
-; AVX512F-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm12, %zmm1
-; AVX512F-NEXT:    vpermt2d %zmm22, %zmm13, %zmm1
-; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31]
-; AVX512F-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vpermt2d %zmm22, %zmm1, %zmm12
-; AVX512F-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm26, %zmm24
-; AVX512F-NEXT:    vpermt2d %zmm20, %zmm0, %zmm24
-; AVX512F-NEXT:    vmovdqa64 %zmm26, %zmm27
-; AVX512F-NEXT:    vpermt2d %zmm20, %zmm3, %zmm27
-; AVX512F-NEXT:    vmovdqa64 %zmm26, %zmm31
-; AVX512F-NEXT:    vpermt2d %zmm20, %zmm10, %zmm31
-; AVX512F-NEXT:    vmovdqa64 %zmm26, %zmm30
-; AVX512F-NEXT:    vpermt2d %zmm20, %zmm15, %zmm30
-; AVX512F-NEXT:    vmovdqa64 %zmm26, %zmm22
-; AVX512F-NEXT:    vpermt2d %zmm20, %zmm13, %zmm22
-; AVX512F-NEXT:    vpermt2d %zmm20, %zmm1, %zmm26
-; AVX512F-NEXT:    vmovdqa64 %zmm7, %zmm8
-; AVX512F-NEXT:    vpermt2d %zmm17, %zmm0, %zmm8
-; AVX512F-NEXT:    vmovdqa64 %zmm7, %zmm12
-; AVX512F-NEXT:    vpermt2d %zmm17, %zmm3, %zmm12
-; AVX512F-NEXT:    vmovdqa64 %zmm7, %zmm6
-; AVX512F-NEXT:    vpermt2d %zmm17, %zmm10, %zmm6
-; AVX512F-NEXT:    vmovdqu64 %zmm6, (%rsp) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm7, %zmm6
-; AVX512F-NEXT:    vpermt2d %zmm17, %zmm15, %zmm6
-; AVX512F-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm7, %zmm6
-; AVX512F-NEXT:    vpermt2d %zmm17, %zmm13, %zmm6
-; AVX512F-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2d %zmm17, %zmm1, %zmm7
-; AVX512F-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm25, %zmm18
-; AVX512F-NEXT:    vpermt2d %zmm21, %zmm4, %zmm18
-; AVX512F-NEXT:    vmovdqa64 %zmm25, %zmm19
-; AVX512F-NEXT:    vpermt2d %zmm21, %zmm0, %zmm19
-; AVX512F-NEXT:    vmovdqa64 %zmm25, %zmm23
-; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm6
-; AVX512F-NEXT:    vpermt2d %zmm21, %zmm3, %zmm23
-; AVX512F-NEXT:    vmovdqa64 %zmm25, %zmm16
-; AVX512F-NEXT:    vpermt2d %zmm21, %zmm10, %zmm16
-; AVX512F-NEXT:    vmovdqa64 %zmm25, %zmm20
-; AVX512F-NEXT:    vpermt2d %zmm21, %zmm15, %zmm20
-; AVX512F-NEXT:    vmovdqa64 %zmm25, %zmm17
-; AVX512F-NEXT:    vpermt2d %zmm21, %zmm13, %zmm17
-; AVX512F-NEXT:    vpermt2d %zmm21, %zmm1, %zmm25
-; AVX512F-NEXT:    vmovdqa64 %zmm29, %zmm9
-; AVX512F-NEXT:    vpermt2d %zmm5, %zmm4, %zmm9
-; AVX512F-NEXT:    vmovdqa64 %zmm29, %zmm11
-; AVX512F-NEXT:    vpermt2d %zmm5, %zmm0, %zmm11
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm3
-; AVX512F-NEXT:    vmovdqa64 %zmm29, %zmm14
-; AVX512F-NEXT:    vpermt2d %zmm5, %zmm6, %zmm14
-; AVX512F-NEXT:    vmovdqa64 %zmm29, %zmm21
-; AVX512F-NEXT:    vpermt2d %zmm5, %zmm10, %zmm21
-; AVX512F-NEXT:    vmovdqa64 %zmm29, %zmm7
-; AVX512F-NEXT:    vmovdqa64 %zmm15, %zmm2
-; AVX512F-NEXT:    vpermt2d %zmm5, %zmm15, %zmm7
-; AVX512F-NEXT:    vmovdqa64 %zmm29, %zmm15
-; AVX512F-NEXT:    vpermt2d %zmm5, %zmm13, %zmm15
-; AVX512F-NEXT:    vpermt2d %zmm5, %zmm1, %zmm29
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vpermi2d %zmm28, %zmm0, %zmm4
-; AVX512F-NEXT:    vpermi2d %zmm28, %zmm0, %zmm3
-; AVX512F-NEXT:    vpermi2d %zmm28, %zmm0, %zmm6
-; AVX512F-NEXT:    vpermi2d %zmm28, %zmm0, %zmm10
-; AVX512F-NEXT:    vpermi2d %zmm28, %zmm0, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermi2d %zmm28, %zmm0, %zmm13
-; AVX512F-NEXT:    vpermt2d %zmm28, %zmm1, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm18, %zmm9 {%k1}
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [1,9,17,25,1,9,17,25]
-; AVX512F-NEXT:    # ymm0 = mem[0,1,0,1]
-; AVX512F-NEXT:    vpermi2d %zmm28, %zmm2, %zmm0
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm9, %zmm18
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm24 {%k1}
-; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm9 = [2,10,18,26,2,10,18,26]
-; AVX512F-NEXT:    # ymm9 = mem[0,1,0,1]
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm0
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2d %zmm5, %zmm9, %zmm0
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm24, %zmm24
-; AVX512F-NEXT:    vmovdqa64 %zmm19, %zmm11 {%k1}
-; AVX512F-NEXT:    vpermi2d %zmm28, %zmm2, %zmm9
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm9[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm11, %zmm1
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm27 {%k1}
-; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [3,11,19,27,3,11,19,27]
-; AVX512F-NEXT:    # ymm3 = mem[0,1,0,1]
-; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm0
-; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm9
-; AVX512F-NEXT:    vpermt2d %zmm5, %zmm3, %zmm9
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm4 = ymm12[0,1,2,3],ymm9[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm4, %zmm27, %zmm4
-; AVX512F-NEXT:    vmovdqa64 %zmm23, %zmm14 {%k1}
-; AVX512F-NEXT:    vpermi2d %zmm28, %zmm2, %zmm3
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm3, %zmm14, %zmm3
-; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm6 = [4,12,20,28,4,12,20,28]
-; AVX512F-NEXT:    # ymm6 = mem[0,1,0,1]
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm8
-; AVX512F-NEXT:    vpermt2d %zmm5, %zmm6, %zmm8
-; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm9 = [5,13,21,29,5,13,21,29]
-; AVX512F-NEXT:    # ymm9 = mem[0,1,0,1]
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm11
-; AVX512F-NEXT:    vpermt2d %zmm5, %zmm9, %zmm11
-; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm12 = [6,14,22,30,6,14,22,30]
-; AVX512F-NEXT:    # ymm12 = mem[0,1,0,1]
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm14
-; AVX512F-NEXT:    vpermt2d %zmm5, %zmm12, %zmm14
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} ymm19 = [7,15,23,31,7,15,23,31]
-; AVX512F-NEXT:    # ymm19 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vpermt2d %zmm5, %zmm19, %zmm0
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm5
-; AVX512F-NEXT:    vpermi2d %zmm28, %zmm2, %zmm6
-; AVX512F-NEXT:    vpermi2d %zmm28, %zmm2, %zmm9
-; AVX512F-NEXT:    vpermi2d %zmm28, %zmm2, %zmm12
-; AVX512F-NEXT:    vpermt2d %zmm28, %zmm19, %zmm2
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm0
-; AVX512F-NEXT:    vpblendd $15, (%rsp), %ymm8, %ymm8 # 32-byte Folded Reload
-; AVX512F-NEXT:    # ymm8 = mem[0,1,2,3],ymm8[4,5,6,7]
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm31 {%k1}
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm8, %zmm31, %zmm8
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7]
-; AVX512F-NEXT:    vmovdqa64 %zmm16, %zmm21 {%k1}
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm6, %zmm21, %zmm6
-; AVX512F-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm10 # 32-byte Folded Reload
-; AVX512F-NEXT:    # ymm10 = mem[0,1,2,3],ymm11[4,5,6,7]
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm30 {%k1}
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm10, %zmm30, %zmm10
-; AVX512F-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload
-; AVX512F-NEXT:    # ymm9 = mem[0,1,2,3],ymm9[4,5,6,7]
-; AVX512F-NEXT:    vmovdqa64 %zmm20, %zmm7 {%k1}
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm9, %zmm7, %zmm7
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm22 {%k1}
-; AVX512F-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm9 # 32-byte Folded Reload
-; AVX512F-NEXT:    # ymm9 = mem[0,1,2,3],ymm14[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm9, %zmm22, %zmm9
-; AVX512F-NEXT:    vmovdqa64 %zmm17, %zmm15 {%k1}
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm12[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm15, %zmm2
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm11, %zmm26 {%k1}
-; AVX512F-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm11 # 32-byte Folded Reload
-; AVX512F-NEXT:    # ymm11 = mem[0,1,2,3],ymm5[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm11, %zmm26, %zmm5
-; AVX512F-NEXT:    vmovdqa64 %zmm25, %zmm29 {%k1}
-; AVX512F-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload
-; AVX512F-NEXT:    # ymm11 = mem[0,1,2,3],ymm0[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm11, %zmm29, %zmm11
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 64(%rsi)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, (%rsi)
+; AVX512F-NEXT:    vpermt2d %zmm16, %zmm10, %zmm15
+; AVX512F-NEXT:    vpermt2d %zmm14, %zmm10, %zmm11
+; AVX512F-NEXT:    vmovdqa64 %zmm15, %zmm11 {%k1}
+; AVX512F-NEXT:    vpermt2d %zmm12, %zmm10, %zmm2
+; AVX512F-NEXT:    vpermt2d %zmm9, %zmm10, %zmm5
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm11, %zmm2
+; AVX512F-NEXT:    vpermt2d %zmm7, %zmm10, %zmm6
+; AVX512F-NEXT:    vpermt2d %zmm31, %zmm10, %zmm3
+; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm3 {%k1}
+; AVX512F-NEXT:    vpermt2d %zmm30, %zmm10, %zmm1
+; AVX512F-NEXT:    vpermt2d %zmm29, %zmm10, %zmm0
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm3, %zmm0
+; AVX512F-NEXT:    vmovdqa64 %zmm28, 64(%rsi)
+; AVX512F-NEXT:    vmovdqa64 %zmm17, (%rsi)
 ; AVX512F-NEXT:    vmovdqa64 %zmm18, 64(%rdx)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, (%rdx)
-; AVX512F-NEXT:    vmovdqa64 %zmm1, 64(%rcx)
-; AVX512F-NEXT:    vmovdqa64 %zmm24, (%rcx)
-; AVX512F-NEXT:    vmovdqa64 %zmm3, 64(%r8)
-; AVX512F-NEXT:    vmovdqa64 %zmm4, (%r8)
-; AVX512F-NEXT:    vmovdqa64 %zmm6, 64(%r9)
-; AVX512F-NEXT:    vmovdqa64 %zmm8, (%r9)
-; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512F-NEXT:    vmovdqa64 %zmm7, 64(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm10, (%rax)
-; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512F-NEXT:    vmovdqa64 %zmm19, (%rdx)
+; AVX512F-NEXT:    vmovdqa64 %zmm20, 64(%rcx)
+; AVX512F-NEXT:    vmovdqa64 %zmm21, (%rcx)
+; AVX512F-NEXT:    vmovdqa64 %zmm22, 64(%r8)
+; AVX512F-NEXT:    vmovdqa64 %zmm23, (%r8)
+; AVX512F-NEXT:    vmovdqa64 %zmm24, 64(%r9)
+; AVX512F-NEXT:    vmovdqa64 %zmm25, (%r9)
+; AVX512F-NEXT:    vmovdqa64 %zmm26, 64(%r11)
+; AVX512F-NEXT:    vmovdqa64 %zmm27, (%r11)
+; AVX512F-NEXT:    vmovdqa64 %zmm8, 64(%r10)
+; AVX512F-NEXT:    vmovdqa64 %zmm4, (%r10)
 ; AVX512F-NEXT:    vmovdqa64 %zmm2, 64(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm9, (%rax)
-; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512F-NEXT:    vmovdqa64 %zmm11, 64(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm5, (%rax)
-; AVX512F-NEXT:    addq $1096, %rsp # imm = 0x448
+; AVX512F-NEXT:    vmovdqa64 %zmm0, (%rax)
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512BW-LABEL: load_i32_stride8_vf32:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    subq $1096, %rsp # imm = 0x448
-; AVX512BW-NEXT:    vmovdqa64 704(%rdi), %zmm6
-; AVX512BW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 640(%rdi), %zmm8
-; AVX512BW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 576(%rdi), %zmm28
-; AVX512BW-NEXT:    vmovdqa64 512(%rdi), %zmm11
-; AVX512BW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 832(%rdi), %zmm5
-; AVX512BW-NEXT:    vmovdqa64 768(%rdi), %zmm29
-; AVX512BW-NEXT:    vmovdqa64 960(%rdi), %zmm21
-; AVX512BW-NEXT:    vmovdqa64 896(%rdi), %zmm25
-; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm7
-; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %zmm17
-; AVX512BW-NEXT:    vmovdqa64 128(%rdi), %zmm4
-; AVX512BW-NEXT:    vmovdqa64 192(%rdi), %zmm9
-; AVX512BW-NEXT:    vmovdqa64 320(%rdi), %zmm20
-; AVX512BW-NEXT:    vmovdqa64 256(%rdi), %zmm26
-; AVX512BW-NEXT:    vmovdqa64 448(%rdi), %zmm22
-; AVX512BW-NEXT:    vmovdqa64 384(%rdi), %zmm2
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24]
-; AVX512BW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm1
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm12
-; AVX512BW-NEXT:    vpermt2d %zmm22, %zmm0, %zmm1
-; AVX512BW-NEXT:    vmovdqa64 %zmm26, %zmm2
-; AVX512BW-NEXT:    vpermt2d %zmm20, %zmm0, %zmm2
-; AVX512BW-NEXT:    movb $-64, %al
-; AVX512BW-NEXT:    kmovd %eax, %k1
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
-; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [0,8,16,24,0,8,16,24]
-; AVX512BW-NEXT:    # ymm1 = mem[0,1,0,1]
-; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm3
-; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm10
-; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2d %zmm9, %zmm1, %zmm3
-; AVX512BW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm4
-; AVX512BW-NEXT:    vpermt2d %zmm17, %zmm0, %zmm4
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm3, %zmm2, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm25, %zmm2
-; AVX512BW-NEXT:    vpermt2d %zmm21, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 %zmm29, %zmm3
-; AVX512BW-NEXT:    vpermt2d %zmm5, %zmm0, %zmm3
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
-; AVX512BW-NEXT:    vpermi2d %zmm28, %zmm11, %zmm0
-; AVX512BW-NEXT:    vpermi2d %zmm6, %zmm8, %zmm1
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm3, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm4 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25]
+; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %r11
+; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
+; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %zmm29
+; AVX512BW-NEXT:    vmovdqa64 128(%rdi), %zmm1
+; AVX512BW-NEXT:    vmovdqa64 192(%rdi), %zmm30
+; AVX512BW-NEXT:    vmovdqa64 320(%rdi), %zmm31
+; AVX512BW-NEXT:    vmovdqa64 256(%rdi), %zmm3
+; AVX512BW-NEXT:    vmovdqa64 448(%rdi), %zmm7
+; AVX512BW-NEXT:    vmovdqa64 384(%rdi), %zmm6
+; AVX512BW-NEXT:    vmovdqa64 576(%rdi), %zmm9
+; AVX512BW-NEXT:    vmovdqa64 512(%rdi), %zmm5
+; AVX512BW-NEXT:    vmovdqa64 704(%rdi), %zmm12
+; AVX512BW-NEXT:    vmovdqa64 640(%rdi), %zmm2
+; AVX512BW-NEXT:    vmovdqa64 832(%rdi), %zmm14
+; AVX512BW-NEXT:    vmovdqa64 768(%rdi), %zmm11
+; AVX512BW-NEXT:    vmovdqa64 960(%rdi), %zmm16
+; AVX512BW-NEXT:    vmovdqa64 896(%rdi), %zmm15
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm13 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24]
+; AVX512BW-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vmovdqa64 %zmm15, %zmm17
+; AVX512BW-NEXT:    vpermt2d %zmm16, %zmm13, %zmm17
+; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm18
+; AVX512BW-NEXT:    vpermt2d %zmm14, %zmm13, %zmm18
+; AVX512BW-NEXT:    movb $-64, %dil
+; AVX512BW-NEXT:    kmovd %edi, %k1
+; AVX512BW-NEXT:    vmovdqa64 %zmm17, %zmm18 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm10
+; AVX512BW-NEXT:    vpermt2d %zmm12, %zmm13, %zmm10
+; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm8
+; AVX512BW-NEXT:    vpermt2d %zmm9, %zmm13, %zmm8
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm8, %zmm18, %zmm28
+; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm8
+; AVX512BW-NEXT:    vpermt2d %zmm7, %zmm13, %zmm8
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm10
+; AVX512BW-NEXT:    vpermt2d %zmm31, %zmm13, %zmm10
+; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm10 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm8
+; AVX512BW-NEXT:    vpermt2d %zmm30, %zmm13, %zmm8
+; AVX512BW-NEXT:    vpermi2d %zmm29, %zmm0, %zmm13
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm8, %zmm10, %zmm17
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm8 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25]
+; AVX512BW-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vmovdqa64 %zmm15, %zmm10
+; AVX512BW-NEXT:    vpermt2d %zmm16, %zmm8, %zmm10
+; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm13
+; AVX512BW-NEXT:    vpermt2d %zmm14, %zmm8, %zmm13
+; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm13 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm10
+; AVX512BW-NEXT:    vpermt2d %zmm12, %zmm8, %zmm10
+; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm4
+; AVX512BW-NEXT:    vpermt2d %zmm9, %zmm8, %zmm4
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm4, %zmm13, %zmm18
+; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm4
+; AVX512BW-NEXT:    vpermt2d %zmm7, %zmm8, %zmm4
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm10
+; AVX512BW-NEXT:    vpermt2d %zmm31, %zmm8, %zmm10
+; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm10 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm4
+; AVX512BW-NEXT:    vpermt2d %zmm30, %zmm8, %zmm4
+; AVX512BW-NEXT:    vpermi2d %zmm29, %zmm0, %zmm8
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm4, %zmm10, %zmm19
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm4 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26]
 ; AVX512BW-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm12, %zmm0
-; AVX512BW-NEXT:    vpermt2d %zmm22, %zmm4, %zmm0
-; AVX512BW-NEXT:    vmovdqa64 %zmm26, %zmm1
-; AVX512BW-NEXT:    vpermt2d %zmm20, %zmm4, %zmm1
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [1,9,17,25,1,9,17,25]
-; AVX512BW-NEXT:    # ymm2 = mem[0,1,0,1]
-; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm0
-; AVX512BW-NEXT:    vpermt2d %zmm9, %zmm2, %zmm0
-; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm2
-; AVX512BW-NEXT:    vpermt2d %zmm17, %zmm4, %zmm2
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26]
-; AVX512BW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm12, %zmm1
-; AVX512BW-NEXT:    vpermt2d %zmm22, %zmm0, %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27]
-; AVX512BW-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm12, %zmm1
-; AVX512BW-NEXT:    vpermt2d %zmm22, %zmm2, %zmm1
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm3
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm10 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28]
+; AVX512BW-NEXT:    vmovdqa64 %zmm15, %zmm8
+; AVX512BW-NEXT:    vpermt2d %zmm16, %zmm4, %zmm8
+; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm10
+; AVX512BW-NEXT:    vpermt2d %zmm14, %zmm4, %zmm10
+; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm10 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm8
+; AVX512BW-NEXT:    vpermt2d %zmm12, %zmm4, %zmm8
+; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm13
+; AVX512BW-NEXT:    vpermt2d %zmm9, %zmm4, %zmm13
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm8, %zmm10, %zmm20
+; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm8
+; AVX512BW-NEXT:    vpermt2d %zmm7, %zmm4, %zmm8
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm10
+; AVX512BW-NEXT:    vpermt2d %zmm31, %zmm4, %zmm10
+; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm10 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm8
+; AVX512BW-NEXT:    vpermt2d %zmm30, %zmm4, %zmm8
+; AVX512BW-NEXT:    vpermi2d %zmm29, %zmm0, %zmm4
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm4, %zmm10, %zmm21
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm4 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27]
+; AVX512BW-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vmovdqa64 %zmm15, %zmm8
+; AVX512BW-NEXT:    vpermt2d %zmm16, %zmm4, %zmm8
+; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm10
+; AVX512BW-NEXT:    vpermt2d %zmm14, %zmm4, %zmm10
+; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm10 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm8
+; AVX512BW-NEXT:    vpermt2d %zmm12, %zmm4, %zmm8
+; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm13
+; AVX512BW-NEXT:    vpermt2d %zmm9, %zmm4, %zmm13
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm8, %zmm10, %zmm22
+; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm8
+; AVX512BW-NEXT:    vpermt2d %zmm7, %zmm4, %zmm8
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm10
+; AVX512BW-NEXT:    vpermt2d %zmm31, %zmm4, %zmm10
+; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm10 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm8
+; AVX512BW-NEXT:    vpermt2d %zmm30, %zmm4, %zmm8
+; AVX512BW-NEXT:    vpermi2d %zmm29, %zmm0, %zmm4
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm4, %zmm10, %zmm23
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm4 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28]
+; AVX512BW-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vmovdqa64 %zmm15, %zmm8
+; AVX512BW-NEXT:    vpermt2d %zmm16, %zmm4, %zmm8
+; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm10
+; AVX512BW-NEXT:    vpermt2d %zmm14, %zmm4, %zmm10
+; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm10 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm8
+; AVX512BW-NEXT:    vpermt2d %zmm12, %zmm4, %zmm8
+; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm13
+; AVX512BW-NEXT:    vpermt2d %zmm9, %zmm4, %zmm13
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm8, %zmm10, %zmm24
+; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm8
+; AVX512BW-NEXT:    vpermt2d %zmm7, %zmm4, %zmm8
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm10
+; AVX512BW-NEXT:    vpermt2d %zmm31, %zmm4, %zmm10
+; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm10 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm8
+; AVX512BW-NEXT:    vpermt2d %zmm30, %zmm4, %zmm8
+; AVX512BW-NEXT:    vpermi2d %zmm29, %zmm0, %zmm4
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm4, %zmm10, %zmm25
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm4 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29]
+; AVX512BW-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vmovdqa64 %zmm15, %zmm8
+; AVX512BW-NEXT:    vpermt2d %zmm16, %zmm4, %zmm8
+; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm10
+; AVX512BW-NEXT:    vpermt2d %zmm14, %zmm4, %zmm10
+; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm10 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm8
+; AVX512BW-NEXT:    vpermt2d %zmm12, %zmm4, %zmm8
+; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm13
+; AVX512BW-NEXT:    vpermt2d %zmm9, %zmm4, %zmm13
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm8, %zmm10, %zmm26
+; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm8
+; AVX512BW-NEXT:    vpermt2d %zmm7, %zmm4, %zmm8
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm10
+; AVX512BW-NEXT:    vpermt2d %zmm31, %zmm4, %zmm10
+; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm10 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm8
+; AVX512BW-NEXT:    vpermt2d %zmm30, %zmm4, %zmm8
+; AVX512BW-NEXT:    vpermi2d %zmm29, %zmm0, %zmm4
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm4, %zmm10, %zmm27
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm4 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30]
+; AVX512BW-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vmovdqa64 %zmm15, %zmm8
+; AVX512BW-NEXT:    vpermt2d %zmm16, %zmm4, %zmm8
+; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm10
+; AVX512BW-NEXT:    vpermt2d %zmm14, %zmm4, %zmm10
+; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm10 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm8
+; AVX512BW-NEXT:    vpermt2d %zmm12, %zmm4, %zmm8
+; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm13
+; AVX512BW-NEXT:    vpermt2d %zmm9, %zmm4, %zmm13
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm8, %zmm10, %zmm8
+; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm10
+; AVX512BW-NEXT:    vpermt2d %zmm7, %zmm4, %zmm10
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm13
+; AVX512BW-NEXT:    vpermt2d %zmm31, %zmm4, %zmm13
+; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm13 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm10
+; AVX512BW-NEXT:    vpermt2d %zmm30, %zmm4, %zmm10
+; AVX512BW-NEXT:    vpermi2d %zmm29, %zmm0, %zmm4
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm4, %zmm13, %zmm4
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm10 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31]
 ; AVX512BW-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm12, %zmm2
-; AVX512BW-NEXT:    vpermt2d %zmm22, %zmm10, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm15 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29]
-; AVX512BW-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm12, %zmm1
-; AVX512BW-NEXT:    vpermt2d %zmm22, %zmm15, %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30]
-; AVX512BW-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm12, %zmm1
-; AVX512BW-NEXT:    vpermt2d %zmm22, %zmm13, %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31]
-; AVX512BW-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vpermt2d %zmm22, %zmm1, %zmm12
-; AVX512BW-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm26, %zmm24
-; AVX512BW-NEXT:    vpermt2d %zmm20, %zmm0, %zmm24
-; AVX512BW-NEXT:    vmovdqa64 %zmm26, %zmm27
-; AVX512BW-NEXT:    vpermt2d %zmm20, %zmm3, %zmm27
-; AVX512BW-NEXT:    vmovdqa64 %zmm26, %zmm31
-; AVX512BW-NEXT:    vpermt2d %zmm20, %zmm10, %zmm31
-; AVX512BW-NEXT:    vmovdqa64 %zmm26, %zmm30
-; AVX512BW-NEXT:    vpermt2d %zmm20, %zmm15, %zmm30
-; AVX512BW-NEXT:    vmovdqa64 %zmm26, %zmm22
-; AVX512BW-NEXT:    vpermt2d %zmm20, %zmm13, %zmm22
-; AVX512BW-NEXT:    vpermt2d %zmm20, %zmm1, %zmm26
-; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm8
-; AVX512BW-NEXT:    vpermt2d %zmm17, %zmm0, %zmm8
-; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm12
-; AVX512BW-NEXT:    vpermt2d %zmm17, %zmm3, %zmm12
-; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm6
-; AVX512BW-NEXT:    vpermt2d %zmm17, %zmm10, %zmm6
-; AVX512BW-NEXT:    vmovdqu64 %zmm6, (%rsp) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm6
-; AVX512BW-NEXT:    vpermt2d %zmm17, %zmm15, %zmm6
-; AVX512BW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm6
-; AVX512BW-NEXT:    vpermt2d %zmm17, %zmm13, %zmm6
-; AVX512BW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2d %zmm17, %zmm1, %zmm7
-; AVX512BW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm25, %zmm18
-; AVX512BW-NEXT:    vpermt2d %zmm21, %zmm4, %zmm18
-; AVX512BW-NEXT:    vmovdqa64 %zmm25, %zmm19
-; AVX512BW-NEXT:    vpermt2d %zmm21, %zmm0, %zmm19
-; AVX512BW-NEXT:    vmovdqa64 %zmm25, %zmm23
-; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm6
-; AVX512BW-NEXT:    vpermt2d %zmm21, %zmm3, %zmm23
-; AVX512BW-NEXT:    vmovdqa64 %zmm25, %zmm16
-; AVX512BW-NEXT:    vpermt2d %zmm21, %zmm10, %zmm16
-; AVX512BW-NEXT:    vmovdqa64 %zmm25, %zmm20
-; AVX512BW-NEXT:    vpermt2d %zmm21, %zmm15, %zmm20
-; AVX512BW-NEXT:    vmovdqa64 %zmm25, %zmm17
-; AVX512BW-NEXT:    vpermt2d %zmm21, %zmm13, %zmm17
-; AVX512BW-NEXT:    vpermt2d %zmm21, %zmm1, %zmm25
-; AVX512BW-NEXT:    vmovdqa64 %zmm29, %zmm9
-; AVX512BW-NEXT:    vpermt2d %zmm5, %zmm4, %zmm9
-; AVX512BW-NEXT:    vmovdqa64 %zmm29, %zmm11
-; AVX512BW-NEXT:    vpermt2d %zmm5, %zmm0, %zmm11
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm3
-; AVX512BW-NEXT:    vmovdqa64 %zmm29, %zmm14
-; AVX512BW-NEXT:    vpermt2d %zmm5, %zmm6, %zmm14
-; AVX512BW-NEXT:    vmovdqa64 %zmm29, %zmm21
-; AVX512BW-NEXT:    vpermt2d %zmm5, %zmm10, %zmm21
-; AVX512BW-NEXT:    vmovdqa64 %zmm29, %zmm7
-; AVX512BW-NEXT:    vmovdqa64 %zmm15, %zmm2
-; AVX512BW-NEXT:    vpermt2d %zmm5, %zmm15, %zmm7
-; AVX512BW-NEXT:    vmovdqa64 %zmm29, %zmm15
-; AVX512BW-NEXT:    vpermt2d %zmm5, %zmm13, %zmm15
-; AVX512BW-NEXT:    vpermt2d %zmm5, %zmm1, %zmm29
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vpermi2d %zmm28, %zmm0, %zmm4
-; AVX512BW-NEXT:    vpermi2d %zmm28, %zmm0, %zmm3
-; AVX512BW-NEXT:    vpermi2d %zmm28, %zmm0, %zmm6
-; AVX512BW-NEXT:    vpermi2d %zmm28, %zmm0, %zmm10
-; AVX512BW-NEXT:    vpermi2d %zmm28, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermi2d %zmm28, %zmm0, %zmm13
-; AVX512BW-NEXT:    vpermt2d %zmm28, %zmm1, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm18, %zmm9 {%k1}
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [1,9,17,25,1,9,17,25]
-; AVX512BW-NEXT:    # ymm0 = mem[0,1,0,1]
-; AVX512BW-NEXT:    vpermi2d %zmm28, %zmm2, %zmm0
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm9, %zmm18
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm24 {%k1}
-; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm9 = [2,10,18,26,2,10,18,26]
-; AVX512BW-NEXT:    # ymm9 = mem[0,1,0,1]
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2d %zmm5, %zmm9, %zmm0
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm24, %zmm24
-; AVX512BW-NEXT:    vmovdqa64 %zmm19, %zmm11 {%k1}
-; AVX512BW-NEXT:    vpermi2d %zmm28, %zmm2, %zmm9
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm9[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm11, %zmm1
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm27 {%k1}
-; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [3,11,19,27,3,11,19,27]
-; AVX512BW-NEXT:    # ymm3 = mem[0,1,0,1]
-; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm0
-; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm9
-; AVX512BW-NEXT:    vpermt2d %zmm5, %zmm3, %zmm9
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm12[0,1,2,3],ymm9[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm4, %zmm27, %zmm4
-; AVX512BW-NEXT:    vmovdqa64 %zmm23, %zmm14 {%k1}
-; AVX512BW-NEXT:    vpermi2d %zmm28, %zmm2, %zmm3
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm3, %zmm14, %zmm3
-; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm6 = [4,12,20,28,4,12,20,28]
-; AVX512BW-NEXT:    # ymm6 = mem[0,1,0,1]
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm8
-; AVX512BW-NEXT:    vpermt2d %zmm5, %zmm6, %zmm8
-; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm9 = [5,13,21,29,5,13,21,29]
-; AVX512BW-NEXT:    # ymm9 = mem[0,1,0,1]
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm11
-; AVX512BW-NEXT:    vpermt2d %zmm5, %zmm9, %zmm11
-; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm12 = [6,14,22,30,6,14,22,30]
-; AVX512BW-NEXT:    # ymm12 = mem[0,1,0,1]
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm14
-; AVX512BW-NEXT:    vpermt2d %zmm5, %zmm12, %zmm14
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} ymm19 = [7,15,23,31,7,15,23,31]
-; AVX512BW-NEXT:    # ymm19 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vpermt2d %zmm5, %zmm19, %zmm0
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm5
-; AVX512BW-NEXT:    vpermi2d %zmm28, %zmm2, %zmm6
-; AVX512BW-NEXT:    vpermi2d %zmm28, %zmm2, %zmm9
-; AVX512BW-NEXT:    vpermi2d %zmm28, %zmm2, %zmm12
-; AVX512BW-NEXT:    vpermt2d %zmm28, %zmm19, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm0
-; AVX512BW-NEXT:    vpblendd $15, (%rsp), %ymm8, %ymm8 # 32-byte Folded Reload
-; AVX512BW-NEXT:    # ymm8 = mem[0,1,2,3],ymm8[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm31 {%k1}
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm8, %zmm31, %zmm8
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqa64 %zmm16, %zmm21 {%k1}
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm6, %zmm21, %zmm6
-; AVX512BW-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm10 # 32-byte Folded Reload
-; AVX512BW-NEXT:    # ymm10 = mem[0,1,2,3],ymm11[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm30 {%k1}
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm10, %zmm30, %zmm10
-; AVX512BW-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload
-; AVX512BW-NEXT:    # ymm9 = mem[0,1,2,3],ymm9[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqa64 %zmm20, %zmm7 {%k1}
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm9, %zmm7, %zmm7
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm22 {%k1}
-; AVX512BW-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm9 # 32-byte Folded Reload
-; AVX512BW-NEXT:    # ymm9 = mem[0,1,2,3],ymm14[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm9, %zmm22, %zmm9
-; AVX512BW-NEXT:    vmovdqa64 %zmm17, %zmm15 {%k1}
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm12[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm15, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm26 {%k1}
-; AVX512BW-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm11 # 32-byte Folded Reload
-; AVX512BW-NEXT:    # ymm11 = mem[0,1,2,3],ymm5[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm11, %zmm26, %zmm5
-; AVX512BW-NEXT:    vmovdqa64 %zmm25, %zmm29 {%k1}
-; AVX512BW-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload
-; AVX512BW-NEXT:    # ymm11 = mem[0,1,2,3],ymm0[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm11, %zmm29, %zmm11
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 64(%rsi)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, (%rsi)
+; AVX512BW-NEXT:    vpermt2d %zmm16, %zmm10, %zmm15
+; AVX512BW-NEXT:    vpermt2d %zmm14, %zmm10, %zmm11
+; AVX512BW-NEXT:    vmovdqa64 %zmm15, %zmm11 {%k1}
+; AVX512BW-NEXT:    vpermt2d %zmm12, %zmm10, %zmm2
+; AVX512BW-NEXT:    vpermt2d %zmm9, %zmm10, %zmm5
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm11, %zmm2
+; AVX512BW-NEXT:    vpermt2d %zmm7, %zmm10, %zmm6
+; AVX512BW-NEXT:    vpermt2d %zmm31, %zmm10, %zmm3
+; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm3 {%k1}
+; AVX512BW-NEXT:    vpermt2d %zmm30, %zmm10, %zmm1
+; AVX512BW-NEXT:    vpermt2d %zmm29, %zmm10, %zmm0
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm3, %zmm0
+; AVX512BW-NEXT:    vmovdqa64 %zmm28, 64(%rsi)
+; AVX512BW-NEXT:    vmovdqa64 %zmm17, (%rsi)
 ; AVX512BW-NEXT:    vmovdqa64 %zmm18, 64(%rdx)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, (%rdx)
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, 64(%rcx)
-; AVX512BW-NEXT:    vmovdqa64 %zmm24, (%rcx)
-; AVX512BW-NEXT:    vmovdqa64 %zmm3, 64(%r8)
-; AVX512BW-NEXT:    vmovdqa64 %zmm4, (%r8)
-; AVX512BW-NEXT:    vmovdqa64 %zmm6, 64(%r9)
-; AVX512BW-NEXT:    vmovdqa64 %zmm8, (%r9)
-; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-NEXT:    vmovdqa64 %zmm7, 64(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm10, (%rax)
-; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512BW-NEXT:    vmovdqa64 %zmm19, (%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm20, 64(%rcx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm21, (%rcx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm22, 64(%r8)
+; AVX512BW-NEXT:    vmovdqa64 %zmm23, (%r8)
+; AVX512BW-NEXT:    vmovdqa64 %zmm24, 64(%r9)
+; AVX512BW-NEXT:    vmovdqa64 %zmm25, (%r9)
+; AVX512BW-NEXT:    vmovdqa64 %zmm26, 64(%r11)
+; AVX512BW-NEXT:    vmovdqa64 %zmm27, (%r11)
+; AVX512BW-NEXT:    vmovdqa64 %zmm8, 64(%r10)
+; AVX512BW-NEXT:    vmovdqa64 %zmm4, (%r10)
 ; AVX512BW-NEXT:    vmovdqa64 %zmm2, 64(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm9, (%rax)
-; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-NEXT:    vmovdqa64 %zmm11, 64(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm5, (%rax)
-; AVX512BW-NEXT:    addq $1096, %rsp # imm = 0x448
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rax)
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
   %wide.vec = load <256 x i32>, ptr %in.vec, align 64
@@ -8127,1227 +7985,1195 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ;
 ; AVX512F-LABEL: load_i32_stride8_vf64:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    subq $3304, %rsp # imm = 0xCE8
-; AVX512F-NEXT:    vmovdqa64 1984(%rdi), %zmm21
-; AVX512F-NEXT:    vmovdqa64 1216(%rdi), %zmm7
-; AVX512F-NEXT:    vmovaps 1152(%rdi), %zmm0
+; AVX512F-NEXT:    subq $3144, %rsp # imm = 0xC48
+; AVX512F-NEXT:    vmovdqa64 320(%rdi), %zmm11
+; AVX512F-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 448(%rdi), %zmm18
+; AVX512F-NEXT:    vmovdqa64 1600(%rdi), %zmm31
+; AVX512F-NEXT:    vmovaps 1536(%rdi), %zmm0
+; AVX512F-NEXT:    vmovups %zmm0, (%rsp) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 1728(%rdi), %zmm24
+; AVX512F-NEXT:    vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovaps 1664(%rdi), %zmm0
 ; AVX512F-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 1088(%rdi), %zmm23
-; AVX512F-NEXT:    vmovdqa64 1024(%rdi), %zmm8
-; AVX512F-NEXT:    vmovdqa64 1344(%rdi), %zmm22
-; AVX512F-NEXT:    vmovdqa64 1280(%rdi), %zmm28
-; AVX512F-NEXT:    vmovdqa64 1472(%rdi), %zmm30
-; AVX512F-NEXT:    vmovdqa64 1408(%rdi), %zmm9
-; AVX512F-NEXT:    vmovdqa64 704(%rdi), %zmm18
-; AVX512F-NEXT:    vmovdqa64 640(%rdi), %zmm15
-; AVX512F-NEXT:    vmovdqa64 576(%rdi), %zmm11
-; AVX512F-NEXT:    vmovdqa64 512(%rdi), %zmm14
-; AVX512F-NEXT:    vmovdqa64 832(%rdi), %zmm17
-; AVX512F-NEXT:    vmovdqa64 768(%rdi), %zmm27
-; AVX512F-NEXT:    vmovdqa64 960(%rdi), %zmm19
-; AVX512F-NEXT:    vmovdqa64 896(%rdi), %zmm16
-; AVX512F-NEXT:    vmovdqa64 (%rdi), %zmm10
-; AVX512F-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 64(%rdi), %zmm13
-; AVX512F-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 128(%rdi), %zmm5
-; AVX512F-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 192(%rdi), %zmm6
+; AVX512F-NEXT:    vmovdqa64 1856(%rdi), %zmm21
+; AVX512F-NEXT:    vmovdqa64 1792(%rdi), %zmm26
+; AVX512F-NEXT:    vmovdqa64 1984(%rdi), %zmm22
+; AVX512F-NEXT:    vmovdqa64 1920(%rdi), %zmm5
+; AVX512F-NEXT:    vmovdqa64 1088(%rdi), %zmm14
+; AVX512F-NEXT:    vmovdqa64 1024(%rdi), %zmm3
+; AVX512F-NEXT:    vmovdqa64 1216(%rdi), %zmm19
+; AVX512F-NEXT:    vmovdqa64 1152(%rdi), %zmm2
+; AVX512F-NEXT:    vmovdqa64 1344(%rdi), %zmm13
+; AVX512F-NEXT:    vmovdqa64 1280(%rdi), %zmm27
+; AVX512F-NEXT:    vmovdqa64 1472(%rdi), %zmm20
+; AVX512F-NEXT:    vmovdqa64 1408(%rdi), %zmm10
+; AVX512F-NEXT:    vmovdqa64 576(%rdi), %zmm17
+; AVX512F-NEXT:    vmovdqa64 512(%rdi), %zmm7
+; AVX512F-NEXT:    vmovdqa64 704(%rdi), %zmm9
+; AVX512F-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 640(%rdi), %zmm12
+; AVX512F-NEXT:    vmovdqa64 832(%rdi), %zmm6
 ; AVX512F-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 320(%rdi), %zmm4
-; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 256(%rdi), %zmm20
-; AVX512F-NEXT:    vmovdqa64 448(%rdi), %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 384(%rdi), %zmm1
-; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 768(%rdi), %zmm28
+; AVX512F-NEXT:    vmovdqa64 960(%rdi), %zmm23
+; AVX512F-NEXT:    vmovdqa64 896(%rdi), %zmm4
 ; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24]
 ; AVX512F-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vpermt2d %zmm2, %zmm0, %zmm1
-; AVX512F-NEXT:    vmovdqa64 %zmm20, %zmm2
-; AVX512F-NEXT:    vpermt2d %zmm4, %zmm0, %zmm2
+; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm1
+; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm16
+; AVX512F-NEXT:    vpermt2d %zmm23, %zmm0, %zmm1
+; AVX512F-NEXT:    vmovdqa64 %zmm28, %zmm4
+; AVX512F-NEXT:    vpermt2d %zmm6, %zmm0, %zmm4
 ; AVX512F-NEXT:    movb $-64, %al
 ; AVX512F-NEXT:    kmovw %eax, %k1
-; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
-; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [0,8,16,24,0,8,16,24]
-; AVX512F-NEXT:    # ymm1 = mem[0,1,0,1]
-; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm4
-; AVX512F-NEXT:    vpermt2d %zmm6, %zmm1, %zmm4
-; AVX512F-NEXT:    vpermt2d %zmm13, %zmm0, %zmm10
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm4 = ymm10[0,1,2,3],ymm4[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm4, %zmm2, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm16, %zmm2
-; AVX512F-NEXT:    vpermt2d %zmm19, %zmm0, %zmm2
-; AVX512F-NEXT:    vmovdqa64 %zmm19, %zmm29
-; AVX512F-NEXT:    vmovdqu64 %zmm19, (%rsp) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm27, %zmm4
-; AVX512F-NEXT:    vpermt2d %zmm17, %zmm0, %zmm4
-; AVX512F-NEXT:    vmovdqa64 %zmm17, %zmm31
-; AVX512F-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm4 {%k1}
-; AVX512F-NEXT:    vmovdqa64 %zmm14, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2d %zmm11, %zmm0, %zmm2
-; AVX512F-NEXT:    vmovdqa64 %zmm15, %zmm6
-; AVX512F-NEXT:    vmovdqa64 %zmm15, %zmm13
-; AVX512F-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2d %zmm18, %zmm1, %zmm6
-; AVX512F-NEXT:    vmovdqa64 %zmm18, %zmm15
-; AVX512F-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm17
-; AVX512F-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm2
-; AVX512F-NEXT:    vpermt2d %zmm30, %zmm0, %zmm2
-; AVX512F-NEXT:    vmovdqa64 %zmm28, %zmm4
-; AVX512F-NEXT:    vmovdqa64 %zmm22, %zmm19
-; AVX512F-NEXT:    vpermt2d %zmm22, %zmm0, %zmm4
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm4 {%k1}
-; AVX512F-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm2
-; AVX512F-NEXT:    vpermt2d %zmm23, %zmm0, %zmm2
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm12, %zmm6
-; AVX512F-NEXT:    vpermt2d %zmm7, %zmm1, %zmm6
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7]
-; AVX512F-NEXT:    vmovdqa64 1920(%rdi), %zmm3
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm2
-; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm18
-; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2d %zmm21, %zmm0, %zmm2
-; AVX512F-NEXT:    vmovdqa64 %zmm21, %zmm22
-; AVX512F-NEXT:    vmovdqa64 1856(%rdi), %zmm10
-; AVX512F-NEXT:    vmovdqa64 1792(%rdi), %zmm4
-; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2d %zmm10, %zmm0, %zmm4
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
+; AVX512F-NEXT:    vmovdqa64 %zmm12, %zmm1
+; AVX512F-NEXT:    vmovdqa64 %zmm12, %zmm15
+; AVX512F-NEXT:    vpermt2d %zmm9, %zmm0, %zmm1
+; AVX512F-NEXT:    vmovdqa64 %zmm7, %zmm12
+; AVX512F-NEXT:    vmovdqa64 %zmm7, %zmm9
+; AVX512F-NEXT:    vpermt2d %zmm17, %zmm0, %zmm12
+; AVX512F-NEXT:    vmovdqa64 %zmm17, %zmm25
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
+; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm10, %zmm1
+; AVX512F-NEXT:    vmovdqa64 %zmm10, %zmm6
 ; AVX512F-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm4 {%k1}
-; AVX512F-NEXT:    vmovdqa64 1600(%rdi), %zmm2
-; AVX512F-NEXT:    vmovdqa64 1536(%rdi), %zmm3
-; AVX512F-NEXT:    vpermi2d %zmm2, %zmm3, %zmm0
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm25
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 1728(%rdi), %zmm26
-; AVX512F-NEXT:    vmovdqa64 1664(%rdi), %zmm9
-; AVX512F-NEXT:    vpermi2d %zmm26, %zmm9, %zmm1
-; AVX512F-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2d %zmm20, %zmm0, %zmm1
+; AVX512F-NEXT:    vmovdqa64 %zmm27, %zmm4
+; AVX512F-NEXT:    vpermt2d %zmm13, %zmm0, %zmm4
+; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm29
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
+; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm1
+; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm8
+; AVX512F-NEXT:    vpermt2d %zmm19, %zmm0, %zmm1
+; AVX512F-NEXT:    vmovdqa64 %zmm19, %zmm30
+; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm10
+; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm12
+; AVX512F-NEXT:    vpermt2d %zmm14, %zmm0, %zmm12
+; AVX512F-NEXT:    vmovdqa64 %zmm14, %zmm13
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
+; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm14
+; AVX512F-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm1
+; AVX512F-NEXT:    vpermt2d %zmm22, %zmm0, %zmm1
+; AVX512F-NEXT:    vmovdqa64 %zmm22, %zmm17
+; AVX512F-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm26, %zmm4
+; AVX512F-NEXT:    vpermt2d %zmm21, %zmm0, %zmm4
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm1
+; AVX512F-NEXT:    vpermt2d %zmm24, %zmm0, %zmm1
+; AVX512F-NEXT:    vmovdqu64 (%rsp), %zmm2 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm12
+; AVX512F-NEXT:    vmovdqa64 %zmm31, %zmm24
+; AVX512F-NEXT:    vpermt2d %zmm31, %zmm0, %zmm12
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT:    vmovdqa64 384(%rdi), %zmm5
+; AVX512F-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
+; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm1
+; AVX512F-NEXT:    vpermt2d %zmm18, %zmm0, %zmm1
+; AVX512F-NEXT:    vmovdqa64 256(%rdi), %zmm22
+; AVX512F-NEXT:    vmovdqa64 %zmm22, %zmm4
+; AVX512F-NEXT:    vpermt2d %zmm11, %zmm0, %zmm4
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
+; AVX512F-NEXT:    vmovdqa64 128(%rdi), %zmm1
+; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 192(%rdi), %zmm5
+; AVX512F-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2d %zmm5, %zmm0, %zmm1
+; AVX512F-NEXT:    vmovdqa64 (%rdi), %zmm7
+; AVX512F-NEXT:    vmovdqa64 64(%rdi), %zmm19
+; AVX512F-NEXT:    vpermi2d %zmm19, %zmm7, %zmm0
+; AVX512F-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
 ; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm0
 ; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25]
 ; AVX512F-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm17, %zmm1
-; AVX512F-NEXT:    vpermt2d %zmm30, %zmm0, %zmm1
-; AVX512F-NEXT:    vmovdqa64 %zmm28, %zmm2
-; AVX512F-NEXT:    vmovdqa64 %zmm28, %zmm24
-; AVX512F-NEXT:    vpermt2d %zmm19, %zmm0, %zmm2
-; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
-; AVX512F-NEXT:    vpermt2d %zmm23, %zmm0, %zmm8
-; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [1,9,17,25,1,9,17,25]
-; AVX512F-NEXT:    # ymm1 = mem[0,1,0,1]
-; AVX512F-NEXT:    vmovdqa64 %zmm12, %zmm6
-; AVX512F-NEXT:    vpermt2d %zmm7, %zmm1, %zmm6
-; AVX512F-NEXT:    vmovdqa64 %zmm7, %zmm17
-; AVX512F-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm6[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm4, %zmm2, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm16, %zmm8
-; AVX512F-NEXT:    vmovdqa64 %zmm16, %zmm2
-; AVX512F-NEXT:    vpermt2d %zmm29, %zmm0, %zmm2
+; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm1
+; AVX512F-NEXT:    vpermt2d %zmm20, %zmm0, %zmm1
 ; AVX512F-NEXT:    vmovdqa64 %zmm27, %zmm4
-; AVX512F-NEXT:    vpermt2d %zmm31, %zmm0, %zmm4
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm4 {%k1}
-; AVX512F-NEXT:    vpermt2d %zmm11, %zmm0, %zmm14
-; AVX512F-NEXT:    vpermt2d %zmm15, %zmm1, %zmm13
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3],ymm13[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm12, %zmm2
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2d %zmm5, %zmm0, %zmm2
-; AVX512F-NEXT:    vmovdqa64 %zmm20, %zmm4
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-NEXT:    vpermt2d %zmm29, %zmm0, %zmm4
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm4 {%k1}
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
+; AVX512F-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm1
+; AVX512F-NEXT:    vmovdqa64 %zmm30, %zmm31
+; AVX512F-NEXT:    vpermt2d %zmm30, %zmm0, %zmm1
+; AVX512F-NEXT:    vmovdqa64 %zmm10, %zmm12
+; AVX512F-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm30
+; AVX512F-NEXT:    vpermt2d %zmm13, %zmm0, %zmm12
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
+; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm16, %zmm1
+; AVX512F-NEXT:    vpermt2d %zmm23, %zmm0, %zmm1
+; AVX512F-NEXT:    vmovdqa64 %zmm28, %zmm4
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2d %zmm5, %zmm0, %zmm4
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
+; AVX512F-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm15, %zmm1
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm2
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2d %zmm31, %zmm1, %zmm2
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm7, %zmm6
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2d %zmm16, %zmm0, %zmm6
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm18, %zmm2
-; AVX512F-NEXT:    vpermt2d %zmm21, %zmm0, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm21, %zmm4
-; AVX512F-NEXT:    vpermt2d %zmm10, %zmm0, %zmm4
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm4 {%k1}
-; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermi2d %zmm25, %zmm3, %zmm0
-; AVX512F-NEXT:    vmovdqa64 %zmm26, %zmm10
-; AVX512F-NEXT:    vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermi2d %zmm26, %zmm9, %zmm1
+; AVX512F-NEXT:    vpermt2d %zmm13, %zmm0, %zmm1
+; AVX512F-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm12
+; AVX512F-NEXT:    vpermt2d %zmm25, %zmm0, %zmm12
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
+; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm14, %zmm1
+; AVX512F-NEXT:    vpermt2d %zmm17, %zmm0, %zmm1
+; AVX512F-NEXT:    vmovdqa64 %zmm26, %zmm4
+; AVX512F-NEXT:    vpermt2d %zmm21, %zmm0, %zmm4
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
+; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm1
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2d %zmm3, %zmm0, %zmm1
+; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm12
+; AVX512F-NEXT:    vpermt2d %zmm24, %zmm0, %zmm12
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
+; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm1
+; AVX512F-NEXT:    vpermt2d %zmm18, %zmm0, %zmm1
+; AVX512F-NEXT:    vmovdqa64 %zmm22, %zmm4
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2d %zmm2, %zmm0, %zmm4
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm14, %zmm1
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2d %zmm11, %zmm0, %zmm1
+; AVX512F-NEXT:    vpermi2d %zmm19, %zmm7, %zmm0
 ; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
 ; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm0
 ; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26]
 ; AVX512F-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm28, %zmm1
-; AVX512F-NEXT:    vpermt2d %zmm30, %zmm0, %zmm1
-; AVX512F-NEXT:    vmovdqa64 %zmm24, %zmm18
-; AVX512F-NEXT:    vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm24, %zmm2
-; AVX512F-NEXT:    vmovdqa64 %zmm19, %zmm26
-; AVX512F-NEXT:    vpermt2d %zmm19, %zmm0, %zmm2
-; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm15, %zmm4
-; AVX512F-NEXT:    vmovdqa64 %zmm23, %zmm14
-; AVX512F-NEXT:    vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2d %zmm23, %zmm0, %zmm4
-; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [2,10,18,26,2,10,18,26]
-; AVX512F-NEXT:    # ymm1 = mem[0,1,0,1]
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm6
-; AVX512F-NEXT:    vpermt2d %zmm17, %zmm1, %zmm6
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm4, %zmm2, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm2
-; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm24
-; AVX512F-NEXT:    vmovdqu64 (%rsp), %zmm19 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2d %zmm19, %zmm0, %zmm2
-; AVX512F-NEXT:    vmovdqa64 %zmm27, %zmm4
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2d %zmm17, %zmm0, %zmm4
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm4 {%k1}
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2d %zmm11, %zmm0, %zmm2
-; AVX512F-NEXT:    vmovdqa64 %zmm11, %zmm23
-; AVX512F-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2d %zmm25, %zmm1, %zmm6
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm12, %zmm2
-; AVX512F-NEXT:    vpermt2d %zmm5, %zmm0, %zmm2
-; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm11
-; AVX512F-NEXT:    vmovdqa64 %zmm20, %zmm4
-; AVX512F-NEXT:    vmovdqa64 %zmm29, %zmm5
+; AVX512F-NEXT:    vmovdqa64 %zmm17, %zmm1
+; AVX512F-NEXT:    vpermt2d %zmm20, %zmm0, %zmm1
+; AVX512F-NEXT:    vmovdqa64 %zmm27, %zmm4
 ; AVX512F-NEXT:    vpermt2d %zmm29, %zmm0, %zmm4
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm4 {%k1}
-; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm2
-; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm8
-; AVX512F-NEXT:    vpermt2d %zmm31, %zmm1, %zmm2
-; AVX512F-NEXT:    vpermt2d %zmm16, %zmm0, %zmm7
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm7, %zmm2
-; AVX512F-NEXT:    vpermt2d %zmm22, %zmm0, %zmm2
-; AVX512F-NEXT:    vmovdqa64 %zmm21, %zmm4
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2d %zmm12, %zmm0, %zmm4
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm4 {%k1}
-; AVX512F-NEXT:    vpermi2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm0 # 64-byte Folded Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT:    vpermi2d %zmm10, %zmm3, %zmm1
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
+; AVX512F-NEXT:    vpermt2d %zmm31, %zmm0, %zmm8
+; AVX512F-NEXT:    vpermt2d %zmm30, %zmm0, %zmm10
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm8[4,5,6,7]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
+; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm16, %zmm1
+; AVX512F-NEXT:    vpermt2d %zmm23, %zmm0, %zmm1
+; AVX512F-NEXT:    vmovdqa64 %zmm28, %zmm4
+; AVX512F-NEXT:    vpermt2d %zmm5, %zmm0, %zmm4
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
+; AVX512F-NEXT:    vpermt2d %zmm13, %zmm0, %zmm15
+; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm16
+; AVX512F-NEXT:    vpermt2d %zmm25, %zmm0, %zmm9
+; AVX512F-NEXT:    vmovdqa64 %zmm25, %zmm29
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm15[4,5,6,7]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
+; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm1
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2d %zmm10, %zmm0, %zmm1
+; AVX512F-NEXT:    vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm26, %zmm4
+; AVX512F-NEXT:    vmovdqa64 %zmm21, %zmm7
+; AVX512F-NEXT:    vpermt2d %zmm21, %zmm0, %zmm4
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm1
+; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm25
+; AVX512F-NEXT:    vpermt2d %zmm3, %zmm0, %zmm1
+; AVX512F-NEXT:    vmovdqu64 (%rsp), %zmm3 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm12
+; AVX512F-NEXT:    vmovdqa64 %zmm24, %zmm15
+; AVX512F-NEXT:    vpermt2d %zmm24, %zmm0, %zmm12
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
+; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm13
+; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm1
+; AVX512F-NEXT:    vpermt2d %zmm18, %zmm0, %zmm1
+; AVX512F-NEXT:    vmovdqa64 %zmm22, %zmm4
+; AVX512F-NEXT:    vpermt2d %zmm2, %zmm0, %zmm4
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
+; AVX512F-NEXT:    vmovdqa64 %zmm14, %zmm6
+; AVX512F-NEXT:    vmovdqa64 %zmm14, %zmm1
+; AVX512F-NEXT:    vpermt2d %zmm11, %zmm0, %zmm1
+; AVX512F-NEXT:    vmovdqa64 %zmm11, %zmm24
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512F-NEXT:    vpermi2d %zmm19, %zmm14, %zmm0
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm0
+; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27]
 ; AVX512F-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm28, %zmm1
-; AVX512F-NEXT:    vpermt2d %zmm30, %zmm0, %zmm1
-; AVX512F-NEXT:    vpermt2d %zmm26, %zmm0, %zmm18
-; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm18 {%k1}
-; AVX512F-NEXT:    vpermt2d %zmm14, %zmm0, %zmm15
-; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [3,11,19,27,3,11,19,27]
-; AVX512F-NEXT:    # ymm1 = mem[0,1,0,1]
-; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm6
-; AVX512F-NEXT:    vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm6 # 64-byte Folded Reload
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm4 = ymm15[0,1,2,3],ymm6[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm4, %zmm18, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm24, %zmm2
-; AVX512F-NEXT:    vmovdqa64 %zmm24, %zmm29
-; AVX512F-NEXT:    vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2d %zmm19, %zmm0, %zmm2
-; AVX512F-NEXT:    vmovdqa64 %zmm19, %zmm24
+; AVX512F-NEXT:    vmovdqa64 %zmm17, %zmm1
+; AVX512F-NEXT:    vpermt2d %zmm20, %zmm0, %zmm1
 ; AVX512F-NEXT:    vmovdqa64 %zmm27, %zmm4
-; AVX512F-NEXT:    vmovdqa64 %zmm27, %zmm16
-; AVX512F-NEXT:    vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2d %zmm17, %zmm0, %zmm4
-; AVX512F-NEXT:    vmovdqa64 %zmm17, %zmm28
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm4 {%k1}
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm2
-; AVX512F-NEXT:    vpermt2d %zmm23, %zmm0, %zmm2
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm14, %zmm6
-; AVX512F-NEXT:    vmovdqa64 %zmm25, %zmm15
-; AVX512F-NEXT:    vpermt2d %zmm25, %zmm1, %zmm6
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm23, %zmm2
-; AVX512F-NEXT:    vpermt2d %zmm11, %zmm0, %zmm2
-; AVX512F-NEXT:    vmovdqa64 %zmm11, %zmm27
-; AVX512F-NEXT:    vmovdqa64 %zmm20, %zmm4
-; AVX512F-NEXT:    vmovdqa64 %zmm20, %zmm22
-; AVX512F-NEXT:    vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2d %zmm2, %zmm0, %zmm4
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm31, %zmm21
+; AVX512F-NEXT:    vpermt2d %zmm31, %zmm0, %zmm1
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2d %zmm30, %zmm0, %zmm12
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
+; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm11, %zmm1
+; AVX512F-NEXT:    vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2d %zmm23, %zmm0, %zmm1
+; AVX512F-NEXT:    vmovdqa64 %zmm28, %zmm4
 ; AVX512F-NEXT:    vpermt2d %zmm5, %zmm0, %zmm4
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm4 {%k1}
-; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm2
-; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm11
-; AVX512F-NEXT:    vpermt2d %zmm31, %zmm1, %zmm2
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm10, %zmm6
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2d %zmm20, %zmm0, %zmm6
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm7, %zmm17
-; AVX512F-NEXT:    vmovdqa64 %zmm7, %zmm2
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2d %zmm18, %zmm0, %zmm2
-; AVX512F-NEXT:    vmovdqa64 %zmm21, %zmm4
-; AVX512F-NEXT:    vmovdqa64 %zmm12, %zmm21
-; AVX512F-NEXT:    vpermt2d %zmm12, %zmm0, %zmm4
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm4 {%k1}
+; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm31
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2d %zmm16, %zmm0, %zmm1
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512F-NEXT:    vpermi2d %zmm12, %zmm7, %zmm0
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512F-NEXT:    vpermi2d %zmm8, %zmm3, %zmm1
+; AVX512F-NEXT:    vpermt2d %zmm29, %zmm0, %zmm12
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
+; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm1
+; AVX512F-NEXT:    vpermt2d %zmm10, %zmm0, %zmm1
+; AVX512F-NEXT:    vmovdqa64 %zmm26, %zmm4
+; AVX512F-NEXT:    vpermt2d %zmm7, %zmm0, %zmm4
+; AVX512F-NEXT:    vmovdqa64 %zmm7, %zmm26
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
+; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm1
+; AVX512F-NEXT:    vmovdqa64 %zmm25, %zmm9
+; AVX512F-NEXT:    vpermt2d %zmm25, %zmm0, %zmm1
+; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm12
+; AVX512F-NEXT:    vpermt2d %zmm15, %zmm0, %zmm12
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
+; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm1
+; AVX512F-NEXT:    vpermt2d %zmm18, %zmm0, %zmm1
+; AVX512F-NEXT:    vmovdqa64 %zmm18, %zmm25
+; AVX512F-NEXT:    vmovdqa64 %zmm22, %zmm4
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2d %zmm5, %zmm0, %zmm4
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
+; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm1
+; AVX512F-NEXT:    vmovdqa64 %zmm24, %zmm18
+; AVX512F-NEXT:    vpermt2d %zmm24, %zmm0, %zmm1
+; AVX512F-NEXT:    vpermi2d %zmm19, %zmm14, %zmm0
+; AVX512F-NEXT:    vmovdqa64 %zmm19, %zmm24
+; AVX512F-NEXT:    vmovdqa64 %zmm14, %zmm13
 ; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
 ; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm0
 ; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28]
 ; AVX512F-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm1
+; AVX512F-NEXT:    vmovdqa64 %zmm17, %zmm1
+; AVX512F-NEXT:    vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2d %zmm20, %zmm0, %zmm1
+; AVX512F-NEXT:    vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm27, %zmm4
+; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm3
+; AVX512F-NEXT:    vpermt2d %zmm2, %zmm0, %zmm4
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm7, %zmm1
+; AVX512F-NEXT:    vmovdqa64 %zmm21, %zmm6
+; AVX512F-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2d %zmm21, %zmm0, %zmm1
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
 ; AVX512F-NEXT:    vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2d %zmm30, %zmm0, %zmm1
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm25, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2d %zmm26, %zmm0, %zmm2
-; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm4
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2d %zmm19, %zmm0, %zmm4
-; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [4,12,20,28,4,12,20,28]
-; AVX512F-NEXT:    # ymm1 = mem[0,1,0,1]
-; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm6
+; AVX512F-NEXT:    vpermt2d %zmm30, %zmm0, %zmm12
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
+; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm11, %zmm1
+; AVX512F-NEXT:    vpermt2d %zmm23, %zmm0, %zmm1
+; AVX512F-NEXT:    vmovdqa64 %zmm28, %zmm4
+; AVX512F-NEXT:    vpermt2d %zmm31, %zmm0, %zmm4
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm1
+; AVX512F-NEXT:    vpermt2d %zmm16, %zmm0, %zmm1
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm12
+; AVX512F-NEXT:    vmovdqa64 %zmm29, %zmm11
+; AVX512F-NEXT:    vpermt2d %zmm29, %zmm0, %zmm12
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
+; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm23, %zmm1
+; AVX512F-NEXT:    vpermt2d %zmm10, %zmm0, %zmm1
+; AVX512F-NEXT:    vmovdqa64 %zmm10, %zmm31
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm10, %zmm4
+; AVX512F-NEXT:    vmovdqa64 %zmm26, %zmm19
+; AVX512F-NEXT:    vpermt2d %zmm26, %zmm0, %zmm4
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2d %zmm9, %zmm0, %zmm1
+; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm26
+; AVX512F-NEXT:    vmovdqu64 (%rsp), %zmm12 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2d %zmm15, %zmm0, %zmm12
+; AVX512F-NEXT:    vmovdqa64 %zmm15, %zmm29
+; AVX512F-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
+; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm21, %zmm1
+; AVX512F-NEXT:    vpermt2d %zmm25, %zmm0, %zmm1
+; AVX512F-NEXT:    vmovdqa64 %zmm22, %zmm4
+; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm14
+; AVX512F-NEXT:    vpermt2d %zmm5, %zmm0, %zmm4
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2d %zmm9, %zmm1, %zmm6
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm4, %zmm2, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2d %zmm24, %zmm0, %zmm29
-; AVX512F-NEXT:    vmovdqa64 %zmm16, %zmm4
-; AVX512F-NEXT:    vpermt2d %zmm28, %zmm0, %zmm4
-; AVX512F-NEXT:    vmovdqa64 %zmm29, %zmm4 {%k1}
-; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm2
+; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm1
+; AVX512F-NEXT:    vpermt2d %zmm18, %zmm0, %zmm1
+; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm5
+; AVX512F-NEXT:    vmovdqa64 %zmm24, %zmm13
+; AVX512F-NEXT:    vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermi2d %zmm24, %zmm5, %zmm0
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm0
+; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29]
+; AVX512F-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vpermt2d %zmm20, %zmm0, %zmm17
+; AVX512F-NEXT:    vpermt2d %zmm3, %zmm0, %zmm27
+; AVX512F-NEXT:    vmovdqa64 %zmm17, %zmm27 {%k1}
+; AVX512F-NEXT:    vmovdqa64 %zmm7, %zmm1
+; AVX512F-NEXT:    vpermt2d %zmm6, %zmm0, %zmm1
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2d %zmm30, %zmm0, %zmm12
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm27, %zmm1
+; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2d %zmm24, %zmm0, %zmm2
-; AVX512F-NEXT:    vmovdqa64 %zmm14, %zmm6
-; AVX512F-NEXT:    vpermt2d %zmm15, %zmm1, %zmm6
-; AVX512F-NEXT:    vmovdqa64 %zmm15, %zmm16
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm23, %zmm2
-; AVX512F-NEXT:    vpermt2d %zmm27, %zmm0, %zmm2
-; AVX512F-NEXT:    vmovdqa64 %zmm27, %zmm15
+; AVX512F-NEXT:    vmovdqa64 %zmm24, %zmm1
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2d %zmm3, %zmm0, %zmm1
+; AVX512F-NEXT:    vmovdqa64 %zmm28, %zmm4
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2d %zmm7, %zmm0, %zmm4
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
+; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm1
+; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm15
+; AVX512F-NEXT:    vpermt2d %zmm16, %zmm0, %zmm1
+; AVX512F-NEXT:    vmovdqa64 %zmm16, %zmm8
+; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm12
+; AVX512F-NEXT:    vpermt2d %zmm11, %zmm0, %zmm12
+; AVX512F-NEXT:    vmovdqa64 %zmm11, %zmm2
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
+; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm23, %zmm1
+; AVX512F-NEXT:    vpermt2d %zmm31, %zmm0, %zmm1
+; AVX512F-NEXT:    vmovdqa64 %zmm31, %zmm16
+; AVX512F-NEXT:    vmovdqa64 %zmm10, %zmm4
+; AVX512F-NEXT:    vmovdqa64 %zmm10, %zmm27
+; AVX512F-NEXT:    vpermt2d %zmm19, %zmm0, %zmm4
+; AVX512F-NEXT:    vmovdqa64 %zmm19, %zmm20
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm10, %zmm1
+; AVX512F-NEXT:    vpermt2d %zmm26, %zmm0, %zmm1
+; AVX512F-NEXT:    vmovdqa64 %zmm26, %zmm17
+; AVX512F-NEXT:    vmovdqu64 (%rsp), %zmm6 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm12
+; AVX512F-NEXT:    vpermt2d %zmm29, %zmm0, %zmm12
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
+; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm21, %zmm1
+; AVX512F-NEXT:    vpermt2d %zmm25, %zmm0, %zmm1
 ; AVX512F-NEXT:    vmovdqa64 %zmm22, %zmm4
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2d %zmm27, %zmm0, %zmm4
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm4 {%k1}
-; AVX512F-NEXT:    vpermt2d %zmm31, %zmm1, %zmm11
-; AVX512F-NEXT:    vmovdqa64 %zmm10, %zmm6
-; AVX512F-NEXT:    vpermt2d %zmm20, %zmm0, %zmm6
-; AVX512F-NEXT:    vmovdqa64 %zmm20, %zmm22
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm11[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm17, %zmm2
-; AVX512F-NEXT:    vpermt2d %zmm18, %zmm0, %zmm2
-; AVX512F-NEXT:    vmovdqa64 %zmm18, %zmm6
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm29, %zmm4
-; AVX512F-NEXT:    vpermt2d %zmm21, %zmm0, %zmm4
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm4 {%k1}
-; AVX512F-NEXT:    vpermi2d %zmm12, %zmm7, %zmm0
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-NEXT:    vpermi2d %zmm8, %zmm2, %zmm1
+; AVX512F-NEXT:    vpermt2d %zmm14, %zmm0, %zmm4
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
+; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm1
+; AVX512F-NEXT:    vpermt2d %zmm18, %zmm0, %zmm1
+; AVX512F-NEXT:    vpermi2d %zmm13, %zmm5, %zmm0
 ; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
 ; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm0
 ; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm7 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29]
-; AVX512F-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm0
-; AVX512F-NEXT:    vpermt2d %zmm30, %zmm7, %zmm0
-; AVX512F-NEXT:    vmovdqa64 %zmm25, %zmm1
-; AVX512F-NEXT:    vmovdqa64 %zmm25, %zmm12
-; AVX512F-NEXT:    vpermt2d %zmm26, %zmm7, %zmm1
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm0
-; AVX512F-NEXT:    vpermt2d %zmm19, %zmm7, %zmm0
-; AVX512F-NEXT:    vmovdqa64 %zmm19, %zmm26
-; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [5,13,21,29,5,13,21,29]
-; AVX512F-NEXT:    # ymm2 = mem[0,1,0,1]
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2d %zmm9, %zmm2, %zmm4
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm0
-; AVX512F-NEXT:    vpermt2d %zmm24, %zmm7, %zmm0
-; AVX512F-NEXT:    vpermt2d %zmm16, %zmm2, %zmm14
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6,7]
-; AVX512F-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm21, %zmm9
-; AVX512F-NEXT:    vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2d %zmm0, %zmm7, %zmm9
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm10 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30]
-; AVX512F-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm21, %zmm30
-; AVX512F-NEXT:    vpermt2d %zmm0, %zmm10, %zmm30
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31]
-; AVX512F-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vpermt2d %zmm0, %zmm2, %zmm21
-; AVX512F-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm20, %zmm25
-; AVX512F-NEXT:    vpermt2d %zmm28, %zmm7, %zmm25
-; AVX512F-NEXT:    vmovdqa64 %zmm20, %zmm31
-; AVX512F-NEXT:    vpermt2d %zmm28, %zmm10, %zmm31
-; AVX512F-NEXT:    vpermt2d %zmm28, %zmm2, %zmm20
-; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm4
-; AVX512F-NEXT:    vpermt2d %zmm24, %zmm10, %zmm4
-; AVX512F-NEXT:    vmovdqu64 %zmm4, (%rsp) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2d %zmm24, %zmm2, %zmm13
-; AVX512F-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm23, %zmm19
-; AVX512F-NEXT:    vmovdqa64 %zmm23, %zmm18
-; AVX512F-NEXT:    vpermt2d %zmm15, %zmm7, %zmm18
+; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30]
+; AVX512F-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vmovdqa64 %zmm24, %zmm4
+; AVX512F-NEXT:    vmovdqa64 %zmm24, %zmm30
+; AVX512F-NEXT:    vpermt2d %zmm3, %zmm1, %zmm30
+; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31]
+; AVX512F-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vpermt2d %zmm3, %zmm0, %zmm4
+; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm19
+; AVX512F-NEXT:    vmovdqa64 %zmm28, %zmm11
+; AVX512F-NEXT:    vpermt2d %zmm7, %zmm1, %zmm11
+; AVX512F-NEXT:    vpermt2d %zmm7, %zmm0, %zmm28
+; AVX512F-NEXT:    vmovdqa64 %zmm15, %zmm9
+; AVX512F-NEXT:    vpermt2d %zmm8, %zmm1, %zmm9
+; AVX512F-NEXT:    vpermt2d %zmm8, %zmm0, %zmm15
+; AVX512F-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm8
+; AVX512F-NEXT:    vpermt2d %zmm2, %zmm1, %zmm8
+; AVX512F-NEXT:    vpermt2d %zmm2, %zmm0, %zmm3
+; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm23, %zmm31
+; AVX512F-NEXT:    vpermt2d %zmm16, %zmm1, %zmm31
+; AVX512F-NEXT:    vpermt2d %zmm16, %zmm0, %zmm23
 ; AVX512F-NEXT:    vmovdqa64 %zmm23, %zmm16
-; AVX512F-NEXT:    vpermt2d %zmm15, %zmm10, %zmm16
-; AVX512F-NEXT:    vpermt2d %zmm15, %zmm2, %zmm19
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm23, %zmm11
-; AVX512F-NEXT:    vpermt2d %zmm27, %zmm7, %zmm11
-; AVX512F-NEXT:    vmovdqa64 %zmm23, %zmm17
-; AVX512F-NEXT:    vpermt2d %zmm27, %zmm10, %zmm17
-; AVX512F-NEXT:    vpermt2d %zmm27, %zmm2, %zmm23
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm14
-; AVX512F-NEXT:    vpermt2d %zmm22, %zmm7, %zmm14
-; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm1
-; AVX512F-NEXT:    vpermt2d %zmm22, %zmm10, %zmm1
-; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2d %zmm22, %zmm2, %zmm13
-; AVX512F-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm21
-; AVX512F-NEXT:    vpermt2d %zmm6, %zmm7, %zmm21
-; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm28
-; AVX512F-NEXT:    vpermt2d %zmm6, %zmm10, %zmm28
-; AVX512F-NEXT:    vpermt2d %zmm6, %zmm2, %zmm1
-; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm29, %zmm1
-; AVX512F-NEXT:    vmovdqa64 %zmm29, %zmm22
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2d %zmm0, %zmm7, %zmm22
-; AVX512F-NEXT:    vmovdqa64 %zmm29, %zmm24
-; AVX512F-NEXT:    vpermt2d %zmm0, %zmm10, %zmm24
-; AVX512F-NEXT:    vpermt2d %zmm0, %zmm2, %zmm1
-; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm29
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT:    vpermi2d %zmm1, %zmm0, %zmm7
-; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm27
-; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm15
+; AVX512F-NEXT:    vmovdqa64 %zmm27, %zmm29
+; AVX512F-NEXT:    vpermt2d %zmm20, %zmm1, %zmm29
+; AVX512F-NEXT:    vpermt2d %zmm20, %zmm0, %zmm27
+; AVX512F-NEXT:    vmovdqa64 %zmm27, %zmm26
+; AVX512F-NEXT:    vmovdqa64 %zmm10, %zmm2
+; AVX512F-NEXT:    vmovdqa64 %zmm10, %zmm5
+; AVX512F-NEXT:    vpermt2d %zmm17, %zmm1, %zmm5
+; AVX512F-NEXT:    vpermt2d %zmm17, %zmm0, %zmm2
+; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm4
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2d %zmm2, %zmm1, %zmm4
+; AVX512F-NEXT:    vpermt2d %zmm2, %zmm0, %zmm6
+; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm15
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2d %zmm3, %zmm10, %zmm15
-; AVX512F-NEXT:    vmovdqa64 %zmm12, %zmm13
-; AVX512F-NEXT:    vmovdqa64 %zmm12, %zmm5
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2d %zmm4, %zmm10, %zmm5
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm12, %zmm8
-; AVX512F-NEXT:    vmovdqa64 %zmm26, %zmm6
-; AVX512F-NEXT:    vpermt2d %zmm26, %zmm10, %zmm8
-; AVX512F-NEXT:    vpermi2d %zmm1, %zmm0, %zmm10
-; AVX512F-NEXT:    vpermt2d %zmm1, %zmm2, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2d %zmm3, %zmm2, %zmm27
-; AVX512F-NEXT:    vpermt2d %zmm4, %zmm2, %zmm13
-; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm26
-; AVX512F-NEXT:    vpermt2d %zmm6, %zmm2, %zmm12
-; AVX512F-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm25 {%k1}
-; AVX512F-NEXT:    vinserti64x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm25 # 32-byte Folded Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm18, %zmm11 {%k1}
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm6
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [5,13,21,29,5,13,21,29]
-; AVX512F-NEXT:    # ymm0 = mem[0,1,0,1]
-; AVX512F-NEXT:    vpermt2d %zmm4, %zmm0, %zmm6
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm3 = ymm14[0,1,2,3],ymm6[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm3, %zmm11, %zmm18
-; AVX512F-NEXT:    vmovdqa64 %zmm21, %zmm22 {%k1}
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512F-NEXT:    vpermi2d %zmm14, %zmm1, %zmm0
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm0[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm22, %zmm21
-; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [6,14,22,30,6,14,22,30]
-; AVX512F-NEXT:    # ymm3 = mem[0,1,0,1]
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm12, %zmm7
+; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm23
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2d %zmm2, %zmm3, %zmm7
-; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm11 = [7,15,23,31,7,15,23,31]
-; AVX512F-NEXT:    # ymm11 = mem[0,1,0,1]
-; AVX512F-NEXT:    vpermt2d %zmm2, %zmm11, %zmm12
-; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm0
-; AVX512F-NEXT:    vpermt2d %zmm4, %zmm3, %zmm0
-; AVX512F-NEXT:    vpermt2d %zmm4, %zmm11, %zmm9
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm2
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2d %zmm22, %zmm3, %zmm2
-; AVX512F-NEXT:    vpermi2d %zmm14, %zmm1, %zmm3
-; AVX512F-NEXT:    vpermt2d %zmm14, %zmm11, %zmm1
-; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm14
-; AVX512F-NEXT:    vpermt2d %zmm22, %zmm11, %zmm6
-; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm11
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm2[4,5,6,7]
-; AVX512F-NEXT:    vmovdqa64 %zmm15, %zmm5 {%k1}
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm6, %zmm5, %zmm6
-; AVX512F-NEXT:    vpblendd $15, (%rsp), %ymm7, %ymm7 # 32-byte Folded Reload
-; AVX512F-NEXT:    # ymm7 = mem[0,1,2,3],ymm7[4,5,6,7]
-; AVX512F-NEXT:    vmovdqa64 %zmm30, %zmm31 {%k1}
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm7, %zmm31, %zmm13
-; AVX512F-NEXT:    vmovdqa64 %zmm16, %zmm17 {%k1}
-; AVX512F-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX512F-NEXT:    # ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm17, %zmm0
-; AVX512F-NEXT:    vmovdqa64 %zmm28, %zmm24 {%k1}
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm3[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm24, %zmm1
+; AVX512F-NEXT:    vpermt2d %zmm2, %zmm1, %zmm23
+; AVX512F-NEXT:    vpermt2d %zmm2, %zmm0, %zmm3
+; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm17
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm24
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm20 {%k1}
-; AVX512F-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm3 # 32-byte Folded Reload
-; AVX512F-NEXT:    # ymm3 = mem[0,1,2,3],ymm12[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm3, %zmm20, %zmm3
-; AVX512F-NEXT:    vmovdqa64 %zmm19, %zmm23 {%k1}
-; AVX512F-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm4 # 32-byte Folded Reload
-; AVX512F-NEXT:    # ymm4 = mem[0,1,2,3],ymm9[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm4, %zmm23, %zmm4
+; AVX512F-NEXT:    vpermt2d %zmm2, %zmm1, %zmm24
+; AVX512F-NEXT:    vpermt2d %zmm2, %zmm0, %zmm3
+; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm27
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm12, %zmm13
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2d %zmm3, %zmm1, %zmm13
+; AVX512F-NEXT:    vpermt2d %zmm3, %zmm0, %zmm12
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm10, %zmm3
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2d %zmm6, %zmm1, %zmm3
+; AVX512F-NEXT:    vpermt2d %zmm6, %zmm0, %zmm10
+; AVX512F-NEXT:    vmovdqa64 %zmm21, %zmm6
+; AVX512F-NEXT:    vmovdqa64 %zmm21, %zmm20
+; AVX512F-NEXT:    vpermt2d %zmm25, %zmm1, %zmm20
+; AVX512F-NEXT:    vpermt2d %zmm25, %zmm0, %zmm6
+; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm21
+; AVX512F-NEXT:    vmovdqa64 %zmm22, %zmm25
+; AVX512F-NEXT:    vpermt2d %zmm14, %zmm1, %zmm25
+; AVX512F-NEXT:    vpermt2d %zmm14, %zmm0, %zmm22
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm7, %zmm14
+; AVX512F-NEXT:    vpermt2d %zmm18, %zmm1, %zmm14
+; AVX512F-NEXT:    vpermt2d %zmm18, %zmm0, %zmm7
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm29 {%k1}
-; AVX512F-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm7 # 32-byte Folded Reload
-; AVX512F-NEXT:    # ymm7 = mem[0,1,2,3],ymm14[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm7, %zmm29, %zmm7
-; AVX512F-NEXT:    vmovdqa64 %zmm27, %zmm26 {%k1}
-; AVX512F-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm8 # 32-byte Folded Reload
-; AVX512F-NEXT:    # ymm8 = mem[0,1,2,3],ymm11[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm8, %zmm26, %zmm8
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm2, 192(%rsi)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm2, 128(%rsi)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm2, 64(%rsi)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm2, (%rsi)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm2, 192(%rdx)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm2, (%rdx)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm2, 64(%rdx)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm2, 128(%rdx)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm2, 192(%rcx)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm2, (%rcx)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm2, 64(%rcx)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm2, 128(%rcx)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm2, 192(%r8)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm2, (%r8)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm2, 64(%r8)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm2, 128(%r8)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm2, 192(%r9)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm2, (%r9)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm2, 64(%r9)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm2, 128(%r9)
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512F-NEXT:    vpermi2d %zmm6, %zmm2, %zmm1
+; AVX512F-NEXT:    vpermt2d %zmm6, %zmm0, %zmm2
+; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm6
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm13[4,5,6,7]
+; AVX512F-NEXT:    vmovdqa64 %zmm23, %zmm24 {%k1}
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm24, %zmm0
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm9[4,5,6,7]
+; AVX512F-NEXT:    vmovdqa64 %zmm30, %zmm11 {%k1}
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm11, %zmm2
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm5[4,5,6,7]
+; AVX512F-NEXT:    vmovdqa64 %zmm31, %zmm29 {%k1}
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm3, %zmm29, %zmm3
+; AVX512F-NEXT:    vmovdqa64 %zmm20, %zmm25 {%k1}
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm14[4,5,6,7]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm25, %zmm1
+; AVX512F-NEXT:    vmovdqa64 %zmm19, %zmm28 {%k1}
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
+; AVX512F-NEXT:    # ymm4 = ymm4[0,1,2,3],mem[4,5,6,7]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm4, %zmm28, %zmm4
+; AVX512F-NEXT:    vmovdqa64 %zmm16, %zmm26 {%k1}
+; AVX512F-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm5 # 32-byte Folded Reload
+; AVX512F-NEXT:    # ymm5 = ymm15[0,1,2,3],mem[4,5,6,7]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm5, %zmm26, %zmm5
+; AVX512F-NEXT:    vmovdqa64 %zmm17, %zmm27 {%k1}
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm12[4,5,6,7]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm8, %zmm27, %zmm9
+; AVX512F-NEXT:    vmovdqa64 %zmm21, %zmm22 {%k1}
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm7 = ymm6[0,1,2,3],ymm7[4,5,6,7]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm7, %zmm22, %zmm7
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm8, 192(%rsi)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm8, 128(%rsi)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm8, 64(%rsi)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm6, (%rsi)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm6, 192(%rdx)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm6, (%rdx)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm6, 64(%rdx)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm6, 128(%rdx)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm6, 192(%rcx)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm6, (%rcx)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm6, 64(%rcx)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm6, 128(%rcx)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm6, 192(%r8)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm6, (%r8)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm6, 64(%r8)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm6, 128(%r8)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm6, 192(%r9)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm6, (%r9)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm6, 64(%r9)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm6, 128(%r9)
 ; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512F-NEXT:    vmovdqa64 %zmm21, 192(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm18, (%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm25, 64(%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm2, 128(%rax)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm6, 192(%rax)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm6, (%rax)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm6, 64(%rax)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm6, 128(%rax)
 ; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512F-NEXT:    vmovdqa64 %zmm1, 192(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm0, (%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm13, 64(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm6, 128(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm3, 192(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm1, (%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm2, 64(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm0, 128(%rax)
 ; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512F-NEXT:    vmovdqa64 %zmm8, 128(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm7, 192(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm4, (%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm3, 64(%rax)
-; AVX512F-NEXT:    addq $3304, %rsp # imm = 0xCE8
+; AVX512F-NEXT:    vmovdqa64 %zmm9, 128(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm5, 192(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm7, (%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm4, 64(%rax)
+; AVX512F-NEXT:    addq $3144, %rsp # imm = 0xC48
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512BW-LABEL: load_i32_stride8_vf64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    subq $3304, %rsp # imm = 0xCE8
-; AVX512BW-NEXT:    vmovdqa64 1984(%rdi), %zmm21
-; AVX512BW-NEXT:    vmovdqa64 1216(%rdi), %zmm7
-; AVX512BW-NEXT:    vmovaps 1152(%rdi), %zmm0
+; AVX512BW-NEXT:    subq $3144, %rsp # imm = 0xC48
+; AVX512BW-NEXT:    vmovdqa64 320(%rdi), %zmm11
+; AVX512BW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 448(%rdi), %zmm18
+; AVX512BW-NEXT:    vmovdqa64 1600(%rdi), %zmm31
+; AVX512BW-NEXT:    vmovaps 1536(%rdi), %zmm0
+; AVX512BW-NEXT:    vmovups %zmm0, (%rsp) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 1728(%rdi), %zmm24
+; AVX512BW-NEXT:    vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovaps 1664(%rdi), %zmm0
 ; AVX512BW-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 1088(%rdi), %zmm23
-; AVX512BW-NEXT:    vmovdqa64 1024(%rdi), %zmm8
-; AVX512BW-NEXT:    vmovdqa64 1344(%rdi), %zmm22
-; AVX512BW-NEXT:    vmovdqa64 1280(%rdi), %zmm28
-; AVX512BW-NEXT:    vmovdqa64 1472(%rdi), %zmm30
-; AVX512BW-NEXT:    vmovdqa64 1408(%rdi), %zmm9
-; AVX512BW-NEXT:    vmovdqa64 704(%rdi), %zmm18
-; AVX512BW-NEXT:    vmovdqa64 640(%rdi), %zmm15
-; AVX512BW-NEXT:    vmovdqa64 576(%rdi), %zmm11
-; AVX512BW-NEXT:    vmovdqa64 512(%rdi), %zmm14
-; AVX512BW-NEXT:    vmovdqa64 832(%rdi), %zmm17
-; AVX512BW-NEXT:    vmovdqa64 768(%rdi), %zmm27
-; AVX512BW-NEXT:    vmovdqa64 960(%rdi), %zmm19
-; AVX512BW-NEXT:    vmovdqa64 896(%rdi), %zmm16
-; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm10
-; AVX512BW-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %zmm13
-; AVX512BW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 128(%rdi), %zmm5
-; AVX512BW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 192(%rdi), %zmm6
+; AVX512BW-NEXT:    vmovdqa64 1856(%rdi), %zmm21
+; AVX512BW-NEXT:    vmovdqa64 1792(%rdi), %zmm26
+; AVX512BW-NEXT:    vmovdqa64 1984(%rdi), %zmm22
+; AVX512BW-NEXT:    vmovdqa64 1920(%rdi), %zmm5
+; AVX512BW-NEXT:    vmovdqa64 1088(%rdi), %zmm14
+; AVX512BW-NEXT:    vmovdqa64 1024(%rdi), %zmm3
+; AVX512BW-NEXT:    vmovdqa64 1216(%rdi), %zmm19
+; AVX512BW-NEXT:    vmovdqa64 1152(%rdi), %zmm2
+; AVX512BW-NEXT:    vmovdqa64 1344(%rdi), %zmm13
+; AVX512BW-NEXT:    vmovdqa64 1280(%rdi), %zmm27
+; AVX512BW-NEXT:    vmovdqa64 1472(%rdi), %zmm20
+; AVX512BW-NEXT:    vmovdqa64 1408(%rdi), %zmm10
+; AVX512BW-NEXT:    vmovdqa64 576(%rdi), %zmm17
+; AVX512BW-NEXT:    vmovdqa64 512(%rdi), %zmm7
+; AVX512BW-NEXT:    vmovdqa64 704(%rdi), %zmm9
+; AVX512BW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 640(%rdi), %zmm12
+; AVX512BW-NEXT:    vmovdqa64 832(%rdi), %zmm6
 ; AVX512BW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 320(%rdi), %zmm4
-; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 256(%rdi), %zmm20
-; AVX512BW-NEXT:    vmovdqa64 448(%rdi), %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 384(%rdi), %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 768(%rdi), %zmm28
+; AVX512BW-NEXT:    vmovdqa64 960(%rdi), %zmm23
+; AVX512BW-NEXT:    vmovdqa64 896(%rdi), %zmm4
 ; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24]
 ; AVX512BW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vpermt2d %zmm2, %zmm0, %zmm1
-; AVX512BW-NEXT:    vmovdqa64 %zmm20, %zmm2
-; AVX512BW-NEXT:    vpermt2d %zmm4, %zmm0, %zmm2
+; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm1
+; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm16
+; AVX512BW-NEXT:    vpermt2d %zmm23, %zmm0, %zmm1
+; AVX512BW-NEXT:    vmovdqa64 %zmm28, %zmm4
+; AVX512BW-NEXT:    vpermt2d %zmm6, %zmm0, %zmm4
 ; AVX512BW-NEXT:    movb $-64, %al
 ; AVX512BW-NEXT:    kmovd %eax, %k1
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
-; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [0,8,16,24,0,8,16,24]
-; AVX512BW-NEXT:    # ymm1 = mem[0,1,0,1]
-; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm4
-; AVX512BW-NEXT:    vpermt2d %zmm6, %zmm1, %zmm4
-; AVX512BW-NEXT:    vpermt2d %zmm13, %zmm0, %zmm10
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm10[0,1,2,3],ymm4[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm4, %zmm2, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm16, %zmm2
-; AVX512BW-NEXT:    vpermt2d %zmm19, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 %zmm19, %zmm29
-; AVX512BW-NEXT:    vmovdqu64 %zmm19, (%rsp) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm27, %zmm4
-; AVX512BW-NEXT:    vpermt2d %zmm17, %zmm0, %zmm4
-; AVX512BW-NEXT:    vmovdqa64 %zmm17, %zmm31
-; AVX512BW-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm4 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 %zmm14, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2d %zmm11, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 %zmm15, %zmm6
-; AVX512BW-NEXT:    vmovdqa64 %zmm15, %zmm13
-; AVX512BW-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2d %zmm18, %zmm1, %zmm6
-; AVX512BW-NEXT:    vmovdqa64 %zmm18, %zmm15
-; AVX512BW-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm17
-; AVX512BW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm2
-; AVX512BW-NEXT:    vpermt2d %zmm30, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 %zmm28, %zmm4
-; AVX512BW-NEXT:    vmovdqa64 %zmm22, %zmm19
-; AVX512BW-NEXT:    vpermt2d %zmm22, %zmm0, %zmm4
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm4 {%k1}
-; AVX512BW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm2
-; AVX512BW-NEXT:    vpermt2d %zmm23, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm12, %zmm6
-; AVX512BW-NEXT:    vpermt2d %zmm7, %zmm1, %zmm6
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqa64 1920(%rdi), %zmm3
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm18
-; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2d %zmm21, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 %zmm21, %zmm22
-; AVX512BW-NEXT:    vmovdqa64 1856(%rdi), %zmm10
-; AVX512BW-NEXT:    vmovdqa64 1792(%rdi), %zmm4
-; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2d %zmm10, %zmm0, %zmm4
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 %zmm12, %zmm1
+; AVX512BW-NEXT:    vmovdqa64 %zmm12, %zmm15
+; AVX512BW-NEXT:    vpermt2d %zmm9, %zmm0, %zmm1
+; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm12
+; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm9
+; AVX512BW-NEXT:    vpermt2d %zmm17, %zmm0, %zmm12
+; AVX512BW-NEXT:    vmovdqa64 %zmm17, %zmm25
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
+; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm1
+; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm6
 ; AVX512BW-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm4 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 1600(%rdi), %zmm2
-; AVX512BW-NEXT:    vmovdqa64 1536(%rdi), %zmm3
-; AVX512BW-NEXT:    vpermi2d %zmm2, %zmm3, %zmm0
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm25
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 1728(%rdi), %zmm26
-; AVX512BW-NEXT:    vmovdqa64 1664(%rdi), %zmm9
-; AVX512BW-NEXT:    vpermi2d %zmm26, %zmm9, %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2d %zmm20, %zmm0, %zmm1
+; AVX512BW-NEXT:    vmovdqa64 %zmm27, %zmm4
+; AVX512BW-NEXT:    vpermt2d %zmm13, %zmm0, %zmm4
+; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm29
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm1
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm8
+; AVX512BW-NEXT:    vpermt2d %zmm19, %zmm0, %zmm1
+; AVX512BW-NEXT:    vmovdqa64 %zmm19, %zmm30
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm10
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm12
+; AVX512BW-NEXT:    vpermt2d %zmm14, %zmm0, %zmm12
+; AVX512BW-NEXT:    vmovdqa64 %zmm14, %zmm13
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
+; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm14
+; AVX512BW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm1
+; AVX512BW-NEXT:    vpermt2d %zmm22, %zmm0, %zmm1
+; AVX512BW-NEXT:    vmovdqa64 %zmm22, %zmm17
+; AVX512BW-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm26, %zmm4
+; AVX512BW-NEXT:    vpermt2d %zmm21, %zmm0, %zmm4
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm1
+; AVX512BW-NEXT:    vpermt2d %zmm24, %zmm0, %zmm1
+; AVX512BW-NEXT:    vmovdqu64 (%rsp), %zmm2 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm12
+; AVX512BW-NEXT:    vmovdqa64 %zmm31, %zmm24
+; AVX512BW-NEXT:    vpermt2d %zmm31, %zmm0, %zmm12
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqa64 384(%rdi), %zmm5
+; AVX512BW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
+; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm1
+; AVX512BW-NEXT:    vpermt2d %zmm18, %zmm0, %zmm1
+; AVX512BW-NEXT:    vmovdqa64 256(%rdi), %zmm22
+; AVX512BW-NEXT:    vmovdqa64 %zmm22, %zmm4
+; AVX512BW-NEXT:    vpermt2d %zmm11, %zmm0, %zmm4
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 128(%rdi), %zmm1
+; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 192(%rdi), %zmm5
+; AVX512BW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2d %zmm5, %zmm0, %zmm1
+; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm7
+; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %zmm19
+; AVX512BW-NEXT:    vpermi2d %zmm19, %zmm7, %zmm0
+; AVX512BW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
 ; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm0
 ; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25]
 ; AVX512BW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm17, %zmm1
-; AVX512BW-NEXT:    vpermt2d %zmm30, %zmm0, %zmm1
-; AVX512BW-NEXT:    vmovdqa64 %zmm28, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 %zmm28, %zmm24
-; AVX512BW-NEXT:    vpermt2d %zmm19, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
-; AVX512BW-NEXT:    vpermt2d %zmm23, %zmm0, %zmm8
-; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [1,9,17,25,1,9,17,25]
-; AVX512BW-NEXT:    # ymm1 = mem[0,1,0,1]
-; AVX512BW-NEXT:    vmovdqa64 %zmm12, %zmm6
-; AVX512BW-NEXT:    vpermt2d %zmm7, %zmm1, %zmm6
-; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm17
-; AVX512BW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm6[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm4, %zmm2, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm16, %zmm8
-; AVX512BW-NEXT:    vmovdqa64 %zmm16, %zmm2
-; AVX512BW-NEXT:    vpermt2d %zmm29, %zmm0, %zmm2
+; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm1
+; AVX512BW-NEXT:    vpermt2d %zmm20, %zmm0, %zmm1
 ; AVX512BW-NEXT:    vmovdqa64 %zmm27, %zmm4
-; AVX512BW-NEXT:    vpermt2d %zmm31, %zmm0, %zmm4
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm4 {%k1}
-; AVX512BW-NEXT:    vpermt2d %zmm11, %zmm0, %zmm14
-; AVX512BW-NEXT:    vpermt2d %zmm15, %zmm1, %zmm13
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3],ymm13[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm12, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2d %zmm5, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 %zmm20, %zmm4
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vpermt2d %zmm29, %zmm0, %zmm4
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm4 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
+; AVX512BW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm1
+; AVX512BW-NEXT:    vmovdqa64 %zmm30, %zmm31
+; AVX512BW-NEXT:    vpermt2d %zmm30, %zmm0, %zmm1
+; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm12
+; AVX512BW-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm30
+; AVX512BW-NEXT:    vpermt2d %zmm13, %zmm0, %zmm12
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
+; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm16, %zmm1
+; AVX512BW-NEXT:    vpermt2d %zmm23, %zmm0, %zmm1
+; AVX512BW-NEXT:    vmovdqa64 %zmm28, %zmm4
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2d %zmm5, %zmm0, %zmm4
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
+; AVX512BW-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm15, %zmm1
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2d %zmm31, %zmm1, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm6
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2d %zmm16, %zmm0, %zmm6
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm18, %zmm2
-; AVX512BW-NEXT:    vpermt2d %zmm21, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm21, %zmm4
-; AVX512BW-NEXT:    vpermt2d %zmm10, %zmm0, %zmm4
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm4 {%k1}
-; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermi2d %zmm25, %zmm3, %zmm0
-; AVX512BW-NEXT:    vmovdqa64 %zmm26, %zmm10
-; AVX512BW-NEXT:    vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermi2d %zmm26, %zmm9, %zmm1
+; AVX512BW-NEXT:    vpermt2d %zmm13, %zmm0, %zmm1
+; AVX512BW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm12
+; AVX512BW-NEXT:    vpermt2d %zmm25, %zmm0, %zmm12
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
+; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm14, %zmm1
+; AVX512BW-NEXT:    vpermt2d %zmm17, %zmm0, %zmm1
+; AVX512BW-NEXT:    vmovdqa64 %zmm26, %zmm4
+; AVX512BW-NEXT:    vpermt2d %zmm21, %zmm0, %zmm4
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm1
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2d %zmm3, %zmm0, %zmm1
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm12
+; AVX512BW-NEXT:    vpermt2d %zmm24, %zmm0, %zmm12
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
+; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm1
+; AVX512BW-NEXT:    vpermt2d %zmm18, %zmm0, %zmm1
+; AVX512BW-NEXT:    vmovdqa64 %zmm22, %zmm4
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2d %zmm2, %zmm0, %zmm4
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm14, %zmm1
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2d %zmm11, %zmm0, %zmm1
+; AVX512BW-NEXT:    vpermi2d %zmm19, %zmm7, %zmm0
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
 ; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm0
 ; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26]
 ; AVX512BW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm28, %zmm1
-; AVX512BW-NEXT:    vpermt2d %zmm30, %zmm0, %zmm1
-; AVX512BW-NEXT:    vmovdqa64 %zmm24, %zmm18
-; AVX512BW-NEXT:    vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm24, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 %zmm19, %zmm26
-; AVX512BW-NEXT:    vpermt2d %zmm19, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm15, %zmm4
-; AVX512BW-NEXT:    vmovdqa64 %zmm23, %zmm14
-; AVX512BW-NEXT:    vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2d %zmm23, %zmm0, %zmm4
-; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [2,10,18,26,2,10,18,26]
-; AVX512BW-NEXT:    # ymm1 = mem[0,1,0,1]
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm6
-; AVX512BW-NEXT:    vpermt2d %zmm17, %zmm1, %zmm6
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm4, %zmm2, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm24
-; AVX512BW-NEXT:    vmovdqu64 (%rsp), %zmm19 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2d %zmm19, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 %zmm27, %zmm4
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2d %zmm17, %zmm0, %zmm4
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm4 {%k1}
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2d %zmm11, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm23
-; AVX512BW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2d %zmm25, %zmm1, %zmm6
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm12, %zmm2
-; AVX512BW-NEXT:    vpermt2d %zmm5, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm11
-; AVX512BW-NEXT:    vmovdqa64 %zmm20, %zmm4
-; AVX512BW-NEXT:    vmovdqa64 %zmm29, %zmm5
+; AVX512BW-NEXT:    vmovdqa64 %zmm17, %zmm1
+; AVX512BW-NEXT:    vpermt2d %zmm20, %zmm0, %zmm1
+; AVX512BW-NEXT:    vmovdqa64 %zmm27, %zmm4
 ; AVX512BW-NEXT:    vpermt2d %zmm29, %zmm0, %zmm4
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm4 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm8
-; AVX512BW-NEXT:    vpermt2d %zmm31, %zmm1, %zmm2
-; AVX512BW-NEXT:    vpermt2d %zmm16, %zmm0, %zmm7
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm2
-; AVX512BW-NEXT:    vpermt2d %zmm22, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 %zmm21, %zmm4
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2d %zmm12, %zmm0, %zmm4
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm4 {%k1}
-; AVX512BW-NEXT:    vpermi2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm0 # 64-byte Folded Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-NEXT:    vpermi2d %zmm10, %zmm3, %zmm1
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
+; AVX512BW-NEXT:    vpermt2d %zmm31, %zmm0, %zmm8
+; AVX512BW-NEXT:    vpermt2d %zmm30, %zmm0, %zmm10
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm8[4,5,6,7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
+; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm16, %zmm1
+; AVX512BW-NEXT:    vpermt2d %zmm23, %zmm0, %zmm1
+; AVX512BW-NEXT:    vmovdqa64 %zmm28, %zmm4
+; AVX512BW-NEXT:    vpermt2d %zmm5, %zmm0, %zmm4
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
+; AVX512BW-NEXT:    vpermt2d %zmm13, %zmm0, %zmm15
+; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm16
+; AVX512BW-NEXT:    vpermt2d %zmm25, %zmm0, %zmm9
+; AVX512BW-NEXT:    vmovdqa64 %zmm25, %zmm29
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm15[4,5,6,7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
+; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm1
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2d %zmm10, %zmm0, %zmm1
+; AVX512BW-NEXT:    vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm26, %zmm4
+; AVX512BW-NEXT:    vmovdqa64 %zmm21, %zmm7
+; AVX512BW-NEXT:    vpermt2d %zmm21, %zmm0, %zmm4
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm1
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm25
+; AVX512BW-NEXT:    vpermt2d %zmm3, %zmm0, %zmm1
+; AVX512BW-NEXT:    vmovdqu64 (%rsp), %zmm3 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm12
+; AVX512BW-NEXT:    vmovdqa64 %zmm24, %zmm15
+; AVX512BW-NEXT:    vpermt2d %zmm24, %zmm0, %zmm12
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
+; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm13
+; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm1
+; AVX512BW-NEXT:    vpermt2d %zmm18, %zmm0, %zmm1
+; AVX512BW-NEXT:    vmovdqa64 %zmm22, %zmm4
+; AVX512BW-NEXT:    vpermt2d %zmm2, %zmm0, %zmm4
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 %zmm14, %zmm6
+; AVX512BW-NEXT:    vmovdqa64 %zmm14, %zmm1
+; AVX512BW-NEXT:    vpermt2d %zmm11, %zmm0, %zmm1
+; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm24
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512BW-NEXT:    vpermi2d %zmm19, %zmm14, %zmm0
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
 ; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm0
 ; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27]
 ; AVX512BW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm28, %zmm1
-; AVX512BW-NEXT:    vpermt2d %zmm30, %zmm0, %zmm1
-; AVX512BW-NEXT:    vpermt2d %zmm26, %zmm0, %zmm18
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm18 {%k1}
-; AVX512BW-NEXT:    vpermt2d %zmm14, %zmm0, %zmm15
-; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [3,11,19,27,3,11,19,27]
-; AVX512BW-NEXT:    # ymm1 = mem[0,1,0,1]
-; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm6
-; AVX512BW-NEXT:    vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm6 # 64-byte Folded Reload
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm15[0,1,2,3],ymm6[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm4, %zmm18, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm24, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 %zmm24, %zmm29
-; AVX512BW-NEXT:    vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2d %zmm19, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 %zmm19, %zmm24
+; AVX512BW-NEXT:    vmovdqa64 %zmm17, %zmm1
+; AVX512BW-NEXT:    vpermt2d %zmm20, %zmm0, %zmm1
 ; AVX512BW-NEXT:    vmovdqa64 %zmm27, %zmm4
-; AVX512BW-NEXT:    vmovdqa64 %zmm27, %zmm16
-; AVX512BW-NEXT:    vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2d %zmm17, %zmm0, %zmm4
-; AVX512BW-NEXT:    vmovdqa64 %zmm17, %zmm28
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm4 {%k1}
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm2
-; AVX512BW-NEXT:    vpermt2d %zmm23, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm14, %zmm6
-; AVX512BW-NEXT:    vmovdqa64 %zmm25, %zmm15
-; AVX512BW-NEXT:    vpermt2d %zmm25, %zmm1, %zmm6
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm23, %zmm2
-; AVX512BW-NEXT:    vpermt2d %zmm11, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm27
-; AVX512BW-NEXT:    vmovdqa64 %zmm20, %zmm4
-; AVX512BW-NEXT:    vmovdqa64 %zmm20, %zmm22
-; AVX512BW-NEXT:    vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2d %zmm2, %zmm0, %zmm4
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm31, %zmm21
+; AVX512BW-NEXT:    vpermt2d %zmm31, %zmm0, %zmm1
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2d %zmm30, %zmm0, %zmm12
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
+; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm1
+; AVX512BW-NEXT:    vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2d %zmm23, %zmm0, %zmm1
+; AVX512BW-NEXT:    vmovdqa64 %zmm28, %zmm4
 ; AVX512BW-NEXT:    vpermt2d %zmm5, %zmm0, %zmm4
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm4 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm11
-; AVX512BW-NEXT:    vpermt2d %zmm31, %zmm1, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm6
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2d %zmm20, %zmm0, %zmm6
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm17
-; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2d %zmm18, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 %zmm21, %zmm4
-; AVX512BW-NEXT:    vmovdqa64 %zmm12, %zmm21
-; AVX512BW-NEXT:    vpermt2d %zmm12, %zmm0, %zmm4
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm4 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm31
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2d %zmm16, %zmm0, %zmm1
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512BW-NEXT:    vpermi2d %zmm12, %zmm7, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512BW-NEXT:    vpermi2d %zmm8, %zmm3, %zmm1
+; AVX512BW-NEXT:    vpermt2d %zmm29, %zmm0, %zmm12
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
+; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm1
+; AVX512BW-NEXT:    vpermt2d %zmm10, %zmm0, %zmm1
+; AVX512BW-NEXT:    vmovdqa64 %zmm26, %zmm4
+; AVX512BW-NEXT:    vpermt2d %zmm7, %zmm0, %zmm4
+; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm26
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm1
+; AVX512BW-NEXT:    vmovdqa64 %zmm25, %zmm9
+; AVX512BW-NEXT:    vpermt2d %zmm25, %zmm0, %zmm1
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm12
+; AVX512BW-NEXT:    vpermt2d %zmm15, %zmm0, %zmm12
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
+; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm1
+; AVX512BW-NEXT:    vpermt2d %zmm18, %zmm0, %zmm1
+; AVX512BW-NEXT:    vmovdqa64 %zmm18, %zmm25
+; AVX512BW-NEXT:    vmovdqa64 %zmm22, %zmm4
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2d %zmm5, %zmm0, %zmm4
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm1
+; AVX512BW-NEXT:    vmovdqa64 %zmm24, %zmm18
+; AVX512BW-NEXT:    vpermt2d %zmm24, %zmm0, %zmm1
+; AVX512BW-NEXT:    vpermi2d %zmm19, %zmm14, %zmm0
+; AVX512BW-NEXT:    vmovdqa64 %zmm19, %zmm24
+; AVX512BW-NEXT:    vmovdqa64 %zmm14, %zmm13
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
 ; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm0
 ; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28]
 ; AVX512BW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm1
+; AVX512BW-NEXT:    vmovdqa64 %zmm17, %zmm1
+; AVX512BW-NEXT:    vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2d %zmm20, %zmm0, %zmm1
+; AVX512BW-NEXT:    vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm27, %zmm4
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm3
+; AVX512BW-NEXT:    vpermt2d %zmm2, %zmm0, %zmm4
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm1
+; AVX512BW-NEXT:    vmovdqa64 %zmm21, %zmm6
+; AVX512BW-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2d %zmm21, %zmm0, %zmm1
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
 ; AVX512BW-NEXT:    vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2d %zmm30, %zmm0, %zmm1
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm25, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2d %zmm26, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm4
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2d %zmm19, %zmm0, %zmm4
-; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [4,12,20,28,4,12,20,28]
-; AVX512BW-NEXT:    # ymm1 = mem[0,1,0,1]
-; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm6
+; AVX512BW-NEXT:    vpermt2d %zmm30, %zmm0, %zmm12
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
+; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm1
+; AVX512BW-NEXT:    vpermt2d %zmm23, %zmm0, %zmm1
+; AVX512BW-NEXT:    vmovdqa64 %zmm28, %zmm4
+; AVX512BW-NEXT:    vpermt2d %zmm31, %zmm0, %zmm4
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm1
+; AVX512BW-NEXT:    vpermt2d %zmm16, %zmm0, %zmm1
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm12
+; AVX512BW-NEXT:    vmovdqa64 %zmm29, %zmm11
+; AVX512BW-NEXT:    vpermt2d %zmm29, %zmm0, %zmm12
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
+; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm23, %zmm1
+; AVX512BW-NEXT:    vpermt2d %zmm10, %zmm0, %zmm1
+; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm31
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm4
+; AVX512BW-NEXT:    vmovdqa64 %zmm26, %zmm19
+; AVX512BW-NEXT:    vpermt2d %zmm26, %zmm0, %zmm4
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2d %zmm9, %zmm0, %zmm1
+; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm26
+; AVX512BW-NEXT:    vmovdqu64 (%rsp), %zmm12 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2d %zmm15, %zmm0, %zmm12
+; AVX512BW-NEXT:    vmovdqa64 %zmm15, %zmm29
+; AVX512BW-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
+; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm21, %zmm1
+; AVX512BW-NEXT:    vpermt2d %zmm25, %zmm0, %zmm1
+; AVX512BW-NEXT:    vmovdqa64 %zmm22, %zmm4
+; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm14
+; AVX512BW-NEXT:    vpermt2d %zmm5, %zmm0, %zmm4
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2d %zmm9, %zmm1, %zmm6
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm4, %zmm2, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2d %zmm24, %zmm0, %zmm29
-; AVX512BW-NEXT:    vmovdqa64 %zmm16, %zmm4
-; AVX512BW-NEXT:    vpermt2d %zmm28, %zmm0, %zmm4
-; AVX512BW-NEXT:    vmovdqa64 %zmm29, %zmm4 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm2
+; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm1
+; AVX512BW-NEXT:    vpermt2d %zmm18, %zmm0, %zmm1
+; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm5
+; AVX512BW-NEXT:    vmovdqa64 %zmm24, %zmm13
+; AVX512BW-NEXT:    vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermi2d %zmm24, %zmm5, %zmm0
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm0
+; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29]
+; AVX512BW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vpermt2d %zmm20, %zmm0, %zmm17
+; AVX512BW-NEXT:    vpermt2d %zmm3, %zmm0, %zmm27
+; AVX512BW-NEXT:    vmovdqa64 %zmm17, %zmm27 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm1
+; AVX512BW-NEXT:    vpermt2d %zmm6, %zmm0, %zmm1
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2d %zmm30, %zmm0, %zmm12
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm27, %zmm1
+; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2d %zmm24, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 %zmm14, %zmm6
-; AVX512BW-NEXT:    vpermt2d %zmm15, %zmm1, %zmm6
-; AVX512BW-NEXT:    vmovdqa64 %zmm15, %zmm16
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm23, %zmm2
-; AVX512BW-NEXT:    vpermt2d %zmm27, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 %zmm27, %zmm15
+; AVX512BW-NEXT:    vmovdqa64 %zmm24, %zmm1
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2d %zmm3, %zmm0, %zmm1
+; AVX512BW-NEXT:    vmovdqa64 %zmm28, %zmm4
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2d %zmm7, %zmm0, %zmm4
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm1
+; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm15
+; AVX512BW-NEXT:    vpermt2d %zmm16, %zmm0, %zmm1
+; AVX512BW-NEXT:    vmovdqa64 %zmm16, %zmm8
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm12
+; AVX512BW-NEXT:    vpermt2d %zmm11, %zmm0, %zmm12
+; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm2
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
+; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm23, %zmm1
+; AVX512BW-NEXT:    vpermt2d %zmm31, %zmm0, %zmm1
+; AVX512BW-NEXT:    vmovdqa64 %zmm31, %zmm16
+; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm4
+; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm27
+; AVX512BW-NEXT:    vpermt2d %zmm19, %zmm0, %zmm4
+; AVX512BW-NEXT:    vmovdqa64 %zmm19, %zmm20
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm1
+; AVX512BW-NEXT:    vpermt2d %zmm26, %zmm0, %zmm1
+; AVX512BW-NEXT:    vmovdqa64 %zmm26, %zmm17
+; AVX512BW-NEXT:    vmovdqu64 (%rsp), %zmm6 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm12
+; AVX512BW-NEXT:    vpermt2d %zmm29, %zmm0, %zmm12
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
+; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm21, %zmm1
+; AVX512BW-NEXT:    vpermt2d %zmm25, %zmm0, %zmm1
 ; AVX512BW-NEXT:    vmovdqa64 %zmm22, %zmm4
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2d %zmm27, %zmm0, %zmm4
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm4 {%k1}
-; AVX512BW-NEXT:    vpermt2d %zmm31, %zmm1, %zmm11
-; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm6
-; AVX512BW-NEXT:    vpermt2d %zmm20, %zmm0, %zmm6
-; AVX512BW-NEXT:    vmovdqa64 %zmm20, %zmm22
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm11[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm17, %zmm2
-; AVX512BW-NEXT:    vpermt2d %zmm18, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 %zmm18, %zmm6
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm29, %zmm4
-; AVX512BW-NEXT:    vpermt2d %zmm21, %zmm0, %zmm4
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm4 {%k1}
-; AVX512BW-NEXT:    vpermi2d %zmm12, %zmm7, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-NEXT:    vpermi2d %zmm8, %zmm2, %zmm1
+; AVX512BW-NEXT:    vpermt2d %zmm14, %zmm0, %zmm4
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm1
+; AVX512BW-NEXT:    vpermt2d %zmm18, %zmm0, %zmm1
+; AVX512BW-NEXT:    vpermi2d %zmm13, %zmm5, %zmm0
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
 ; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm0
 ; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm7 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29]
-; AVX512BW-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm0
-; AVX512BW-NEXT:    vpermt2d %zmm30, %zmm7, %zmm0
-; AVX512BW-NEXT:    vmovdqa64 %zmm25, %zmm1
-; AVX512BW-NEXT:    vmovdqa64 %zmm25, %zmm12
-; AVX512BW-NEXT:    vpermt2d %zmm26, %zmm7, %zmm1
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm0
-; AVX512BW-NEXT:    vpermt2d %zmm19, %zmm7, %zmm0
-; AVX512BW-NEXT:    vmovdqa64 %zmm19, %zmm26
-; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [5,13,21,29,5,13,21,29]
-; AVX512BW-NEXT:    # ymm2 = mem[0,1,0,1]
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2d %zmm9, %zmm2, %zmm4
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm0
-; AVX512BW-NEXT:    vpermt2d %zmm24, %zmm7, %zmm0
-; AVX512BW-NEXT:    vpermt2d %zmm16, %zmm2, %zmm14
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm21, %zmm9
-; AVX512BW-NEXT:    vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2d %zmm0, %zmm7, %zmm9
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm10 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30]
-; AVX512BW-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm21, %zmm30
-; AVX512BW-NEXT:    vpermt2d %zmm0, %zmm10, %zmm30
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31]
-; AVX512BW-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vpermt2d %zmm0, %zmm2, %zmm21
-; AVX512BW-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm20, %zmm25
-; AVX512BW-NEXT:    vpermt2d %zmm28, %zmm7, %zmm25
-; AVX512BW-NEXT:    vmovdqa64 %zmm20, %zmm31
-; AVX512BW-NEXT:    vpermt2d %zmm28, %zmm10, %zmm31
-; AVX512BW-NEXT:    vpermt2d %zmm28, %zmm2, %zmm20
-; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm4
-; AVX512BW-NEXT:    vpermt2d %zmm24, %zmm10, %zmm4
-; AVX512BW-NEXT:    vmovdqu64 %zmm4, (%rsp) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2d %zmm24, %zmm2, %zmm13
-; AVX512BW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm23, %zmm19
-; AVX512BW-NEXT:    vmovdqa64 %zmm23, %zmm18
-; AVX512BW-NEXT:    vpermt2d %zmm15, %zmm7, %zmm18
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30]
+; AVX512BW-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vmovdqa64 %zmm24, %zmm4
+; AVX512BW-NEXT:    vmovdqa64 %zmm24, %zmm30
+; AVX512BW-NEXT:    vpermt2d %zmm3, %zmm1, %zmm30
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31]
+; AVX512BW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vpermt2d %zmm3, %zmm0, %zmm4
+; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm19
+; AVX512BW-NEXT:    vmovdqa64 %zmm28, %zmm11
+; AVX512BW-NEXT:    vpermt2d %zmm7, %zmm1, %zmm11
+; AVX512BW-NEXT:    vpermt2d %zmm7, %zmm0, %zmm28
+; AVX512BW-NEXT:    vmovdqa64 %zmm15, %zmm9
+; AVX512BW-NEXT:    vpermt2d %zmm8, %zmm1, %zmm9
+; AVX512BW-NEXT:    vpermt2d %zmm8, %zmm0, %zmm15
+; AVX512BW-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm8
+; AVX512BW-NEXT:    vpermt2d %zmm2, %zmm1, %zmm8
+; AVX512BW-NEXT:    vpermt2d %zmm2, %zmm0, %zmm3
+; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm23, %zmm31
+; AVX512BW-NEXT:    vpermt2d %zmm16, %zmm1, %zmm31
+; AVX512BW-NEXT:    vpermt2d %zmm16, %zmm0, %zmm23
 ; AVX512BW-NEXT:    vmovdqa64 %zmm23, %zmm16
-; AVX512BW-NEXT:    vpermt2d %zmm15, %zmm10, %zmm16
-; AVX512BW-NEXT:    vpermt2d %zmm15, %zmm2, %zmm19
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm23, %zmm11
-; AVX512BW-NEXT:    vpermt2d %zmm27, %zmm7, %zmm11
-; AVX512BW-NEXT:    vmovdqa64 %zmm23, %zmm17
-; AVX512BW-NEXT:    vpermt2d %zmm27, %zmm10, %zmm17
-; AVX512BW-NEXT:    vpermt2d %zmm27, %zmm2, %zmm23
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm14
-; AVX512BW-NEXT:    vpermt2d %zmm22, %zmm7, %zmm14
-; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm1
-; AVX512BW-NEXT:    vpermt2d %zmm22, %zmm10, %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2d %zmm22, %zmm2, %zmm13
-; AVX512BW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm21
-; AVX512BW-NEXT:    vpermt2d %zmm6, %zmm7, %zmm21
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm28
-; AVX512BW-NEXT:    vpermt2d %zmm6, %zmm10, %zmm28
-; AVX512BW-NEXT:    vpermt2d %zmm6, %zmm2, %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm29, %zmm1
-; AVX512BW-NEXT:    vmovdqa64 %zmm29, %zmm22
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2d %zmm0, %zmm7, %zmm22
-; AVX512BW-NEXT:    vmovdqa64 %zmm29, %zmm24
-; AVX512BW-NEXT:    vpermt2d %zmm0, %zmm10, %zmm24
-; AVX512BW-NEXT:    vpermt2d %zmm0, %zmm2, %zmm1
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm29
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vpermi2d %zmm1, %zmm0, %zmm7
-; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm27
-; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm15
+; AVX512BW-NEXT:    vmovdqa64 %zmm27, %zmm29
+; AVX512BW-NEXT:    vpermt2d %zmm20, %zmm1, %zmm29
+; AVX512BW-NEXT:    vpermt2d %zmm20, %zmm0, %zmm27
+; AVX512BW-NEXT:    vmovdqa64 %zmm27, %zmm26
+; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm2
+; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm5
+; AVX512BW-NEXT:    vpermt2d %zmm17, %zmm1, %zmm5
+; AVX512BW-NEXT:    vpermt2d %zmm17, %zmm0, %zmm2
+; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm4
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2d %zmm2, %zmm1, %zmm4
+; AVX512BW-NEXT:    vpermt2d %zmm2, %zmm0, %zmm6
+; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm15
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2d %zmm3, %zmm10, %zmm15
-; AVX512BW-NEXT:    vmovdqa64 %zmm12, %zmm13
-; AVX512BW-NEXT:    vmovdqa64 %zmm12, %zmm5
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2d %zmm4, %zmm10, %zmm5
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm12, %zmm8
-; AVX512BW-NEXT:    vmovdqa64 %zmm26, %zmm6
-; AVX512BW-NEXT:    vpermt2d %zmm26, %zmm10, %zmm8
-; AVX512BW-NEXT:    vpermi2d %zmm1, %zmm0, %zmm10
-; AVX512BW-NEXT:    vpermt2d %zmm1, %zmm2, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2d %zmm3, %zmm2, %zmm27
-; AVX512BW-NEXT:    vpermt2d %zmm4, %zmm2, %zmm13
-; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm26
-; AVX512BW-NEXT:    vpermt2d %zmm6, %zmm2, %zmm12
-; AVX512BW-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm25 {%k1}
-; AVX512BW-NEXT:    vinserti64x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm25 # 32-byte Folded Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm18, %zmm11 {%k1}
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm6
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [5,13,21,29,5,13,21,29]
-; AVX512BW-NEXT:    # ymm0 = mem[0,1,0,1]
-; AVX512BW-NEXT:    vpermt2d %zmm4, %zmm0, %zmm6
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm3 = ymm14[0,1,2,3],ymm6[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm3, %zmm11, %zmm18
-; AVX512BW-NEXT:    vmovdqa64 %zmm21, %zmm22 {%k1}
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512BW-NEXT:    vpermi2d %zmm14, %zmm1, %zmm0
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm0[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm22, %zmm21
-; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [6,14,22,30,6,14,22,30]
-; AVX512BW-NEXT:    # ymm3 = mem[0,1,0,1]
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm12, %zmm7
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm23
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2d %zmm2, %zmm3, %zmm7
-; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm11 = [7,15,23,31,7,15,23,31]
-; AVX512BW-NEXT:    # ymm11 = mem[0,1,0,1]
-; AVX512BW-NEXT:    vpermt2d %zmm2, %zmm11, %zmm12
-; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm0
-; AVX512BW-NEXT:    vpermt2d %zmm4, %zmm3, %zmm0
-; AVX512BW-NEXT:    vpermt2d %zmm4, %zmm11, %zmm9
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2d %zmm22, %zmm3, %zmm2
-; AVX512BW-NEXT:    vpermi2d %zmm14, %zmm1, %zmm3
-; AVX512BW-NEXT:    vpermt2d %zmm14, %zmm11, %zmm1
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm14
-; AVX512BW-NEXT:    vpermt2d %zmm22, %zmm11, %zmm6
-; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm11
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm2[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqa64 %zmm15, %zmm5 {%k1}
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm6, %zmm5, %zmm6
-; AVX512BW-NEXT:    vpblendd $15, (%rsp), %ymm7, %ymm7 # 32-byte Folded Reload
-; AVX512BW-NEXT:    # ymm7 = mem[0,1,2,3],ymm7[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqa64 %zmm30, %zmm31 {%k1}
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm7, %zmm31, %zmm13
-; AVX512BW-NEXT:    vmovdqa64 %zmm16, %zmm17 {%k1}
-; AVX512BW-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX512BW-NEXT:    # ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm17, %zmm0
-; AVX512BW-NEXT:    vmovdqa64 %zmm28, %zmm24 {%k1}
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm3[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm24, %zmm1
+; AVX512BW-NEXT:    vpermt2d %zmm2, %zmm1, %zmm23
+; AVX512BW-NEXT:    vpermt2d %zmm2, %zmm0, %zmm3
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm17
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm24
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm20 {%k1}
-; AVX512BW-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm3 # 32-byte Folded Reload
-; AVX512BW-NEXT:    # ymm3 = mem[0,1,2,3],ymm12[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm3, %zmm20, %zmm3
-; AVX512BW-NEXT:    vmovdqa64 %zmm19, %zmm23 {%k1}
-; AVX512BW-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm4 # 32-byte Folded Reload
-; AVX512BW-NEXT:    # ymm4 = mem[0,1,2,3],ymm9[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm4, %zmm23, %zmm4
+; AVX512BW-NEXT:    vpermt2d %zmm2, %zmm1, %zmm24
+; AVX512BW-NEXT:    vpermt2d %zmm2, %zmm0, %zmm3
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm27
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm12, %zmm13
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2d %zmm3, %zmm1, %zmm13
+; AVX512BW-NEXT:    vpermt2d %zmm3, %zmm0, %zmm12
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm3
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2d %zmm6, %zmm1, %zmm3
+; AVX512BW-NEXT:    vpermt2d %zmm6, %zmm0, %zmm10
+; AVX512BW-NEXT:    vmovdqa64 %zmm21, %zmm6
+; AVX512BW-NEXT:    vmovdqa64 %zmm21, %zmm20
+; AVX512BW-NEXT:    vpermt2d %zmm25, %zmm1, %zmm20
+; AVX512BW-NEXT:    vpermt2d %zmm25, %zmm0, %zmm6
+; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm21
+; AVX512BW-NEXT:    vmovdqa64 %zmm22, %zmm25
+; AVX512BW-NEXT:    vpermt2d %zmm14, %zmm1, %zmm25
+; AVX512BW-NEXT:    vpermt2d %zmm14, %zmm0, %zmm22
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm14
+; AVX512BW-NEXT:    vpermt2d %zmm18, %zmm1, %zmm14
+; AVX512BW-NEXT:    vpermt2d %zmm18, %zmm0, %zmm7
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm29 {%k1}
-; AVX512BW-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm7 # 32-byte Folded Reload
-; AVX512BW-NEXT:    # ymm7 = mem[0,1,2,3],ymm14[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm7, %zmm29, %zmm7
-; AVX512BW-NEXT:    vmovdqa64 %zmm27, %zmm26 {%k1}
-; AVX512BW-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm8 # 32-byte Folded Reload
-; AVX512BW-NEXT:    # ymm8 = mem[0,1,2,3],ymm11[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm8, %zmm26, %zmm8
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm2, 192(%rsi)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm2, 128(%rsi)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm2, 64(%rsi)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm2, (%rsi)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm2, 192(%rdx)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm2, (%rdx)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm2, 64(%rdx)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm2, 128(%rdx)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm2, 192(%rcx)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm2, (%rcx)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm2, 64(%rcx)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm2, 128(%rcx)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm2, 192(%r8)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm2, (%r8)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm2, 64(%r8)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm2, 128(%r8)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm2, 192(%r9)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm2, (%r9)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm2, 64(%r9)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm2, 128(%r9)
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512BW-NEXT:    vpermi2d %zmm6, %zmm2, %zmm1
+; AVX512BW-NEXT:    vpermt2d %zmm6, %zmm0, %zmm2
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm6
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm13[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqa64 %zmm23, %zmm24 {%k1}
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm24, %zmm0
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm9[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqa64 %zmm30, %zmm11 {%k1}
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm11, %zmm2
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm5[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqa64 %zmm31, %zmm29 {%k1}
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm3, %zmm29, %zmm3
+; AVX512BW-NEXT:    vmovdqa64 %zmm20, %zmm25 {%k1}
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm14[4,5,6,7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm25, %zmm1
+; AVX512BW-NEXT:    vmovdqa64 %zmm19, %zmm28 {%k1}
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
+; AVX512BW-NEXT:    # ymm4 = ymm4[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm4, %zmm28, %zmm4
+; AVX512BW-NEXT:    vmovdqa64 %zmm16, %zmm26 {%k1}
+; AVX512BW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm5 # 32-byte Folded Reload
+; AVX512BW-NEXT:    # ymm5 = ymm15[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm5, %zmm26, %zmm5
+; AVX512BW-NEXT:    vmovdqa64 %zmm17, %zmm27 {%k1}
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm12[4,5,6,7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm8, %zmm27, %zmm9
+; AVX512BW-NEXT:    vmovdqa64 %zmm21, %zmm22 {%k1}
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm7 = ymm6[0,1,2,3],ymm7[4,5,6,7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm7, %zmm22, %zmm7
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm8, 192(%rsi)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm8, 128(%rsi)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm8, 64(%rsi)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm6, (%rsi)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm6, 192(%rdx)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm6, (%rdx)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm6, 64(%rdx)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm6, 128(%rdx)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm6, 192(%rcx)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm6, (%rcx)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm6, 64(%rcx)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm6, 128(%rcx)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm6, 192(%r8)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm6, (%r8)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm6, 64(%r8)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm6, 128(%r8)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm6, 192(%r9)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm6, (%r9)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm6, 64(%r9)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm6, 128(%r9)
 ; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-NEXT:    vmovdqa64 %zmm21, 192(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm18, (%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm25, 64(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm2, 128(%rax)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm6, 192(%rax)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm6, (%rax)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm6, 64(%rax)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm6, 128(%rax)
 ; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, 192(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm13, 64(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm6, 128(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, 192(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, (%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, 64(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, 128(%rax)
 ; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-NEXT:    vmovdqa64 %zmm8, 128(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm7, 192(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm4, (%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm3, 64(%rax)
-; AVX512BW-NEXT:    addq $3304, %rsp # imm = 0xCE8
+; AVX512BW-NEXT:    vmovdqa64 %zmm9, 128(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm5, 192(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm7, (%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm4, 64(%rax)
+; AVX512BW-NEXT:    addq $3144, %rsp # imm = 0xC48
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
   %wide.vec = load <512 x i32>, ptr %in.vec, align 64
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-5.ll
index 4a2c38168bdf42..ccc15a4119ba45 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-5.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-5.ll
@@ -520,118 +520,116 @@ define void @load_i64_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512F-LABEL: load_i64_stride5_vf8:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vmovdqa64 256(%rdi), %zmm0
-; AVX512F-NEXT:    vmovdqa64 (%rdi), %zmm1
-; AVX512F-NEXT:    vmovdqa64 64(%rdi), %zmm2
-; AVX512F-NEXT:    vmovdqa64 128(%rdi), %zmm3
-; AVX512F-NEXT:    vmovdqa64 192(%rdi), %zmm4
-; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm5 = [12,1,6,0,12,1,6,0]
-; AVX512F-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vpermi2q %zmm3, %zmm4, %zmm5
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm6 = [0,5,10,15]
-; AVX512F-NEXT:    vpermi2q %zmm2, %zmm1, %zmm6
-; AVX512F-NEXT:    vshufi64x2 {{.*#+}} zmm5 = zmm6[0,1,2,3],zmm5[4,5,6,7]
-; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,4,5,6,11]
-; AVX512F-NEXT:    vpermi2q %zmm0, %zmm5, %zmm6
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm5 = <1,6,11,u>
-; AVX512F-NEXT:    vpermi2q %zmm2, %zmm1, %zmm5
-; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [5,10,15,0,5,10,15,0]
+; AVX512F-NEXT:    vmovdqa64 (%rdi), %zmm2
+; AVX512F-NEXT:    vmovdqa64 64(%rdi), %zmm1
+; AVX512F-NEXT:    vmovdqa64 128(%rdi), %zmm4
+; AVX512F-NEXT:    vmovdqa64 192(%rdi), %zmm5
+; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [12,1,6,0,12,1,6,0]
+; AVX512F-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm6 = [5,10,15,0,5,10,15,0]
+; AVX512F-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vpermi2q %zmm5, %zmm4, %zmm6
+; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [6,11,0,1,6,11,0,1]
 ; AVX512F-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vpermi2q %zmm4, %zmm3, %zmm7
+; AVX512F-NEXT:    vpermi2q %zmm5, %zmm4, %zmm7
+; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [7,12,0,2,7,12,0,2]
+; AVX512F-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vpermi2q %zmm5, %zmm4, %zmm8
+; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm9 = [0,5,0,11,0,5,0,11]
+; AVX512F-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vpermi2q %zmm4, %zmm5, %zmm9
+; AVX512F-NEXT:    vpermt2q %zmm4, %zmm3, %zmm5
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,5,10,15]
+; AVX512F-NEXT:    vpermi2q %zmm1, %zmm2, %zmm4
+; AVX512F-NEXT:    vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm5[4,5,6,7]
+; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,5,6,11]
+; AVX512F-NEXT:    vpermi2q %zmm0, %zmm4, %zmm5
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = <1,6,11,u>
+; AVX512F-NEXT:    vpermi2q %zmm1, %zmm2, %zmm4
 ; AVX512F-NEXT:    movb $7, %al
 ; AVX512F-NEXT:    kmovw %eax, %k1
-; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm7 {%k1}
-; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,5,6,12]
-; AVX512F-NEXT:    vpermi2q %zmm0, %zmm7, %zmm5
-; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [6,11,0,1,6,11,0,1]
-; AVX512F-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vpermi2q %zmm4, %zmm3, %zmm7
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm8 = <2,7,12,u>
-; AVX512F-NEXT:    vpermi2q %zmm2, %zmm1, %zmm8
+; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm6 {%k1}
+; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,12]
+; AVX512F-NEXT:    vpermi2q %zmm0, %zmm6, %zmm4
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm6 = <2,7,12,u>
+; AVX512F-NEXT:    vpermi2q %zmm1, %zmm2, %zmm6
 ; AVX512F-NEXT:    movb $56, %al
 ; AVX512F-NEXT:    kmovw %eax, %k1
-; AVX512F-NEXT:    vmovdqa64 %zmm7, %zmm8 {%k1}
+; AVX512F-NEXT:    vmovdqa64 %zmm7, %zmm6 {%k1}
 ; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,8,13]
-; AVX512F-NEXT:    vpermi2q %zmm0, %zmm8, %zmm7
-; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [7,12,0,2,7,12,0,2]
-; AVX512F-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vpermi2q %zmm4, %zmm3, %zmm8
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm9 = <11,0,5,u>
-; AVX512F-NEXT:    vpermi2q %zmm1, %zmm2, %zmm9
-; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm9 {%k1}
+; AVX512F-NEXT:    vpermi2q %zmm0, %zmm6, %zmm7
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm6 = <11,0,5,u>
+; AVX512F-NEXT:    vpermi2q %zmm2, %zmm1, %zmm6
+; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm6 {%k1}
 ; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,5,9,14]
-; AVX512F-NEXT:    vpermi2q %zmm0, %zmm9, %zmm8
-; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm9 = [0,5,0,11,0,5,0,11]
-; AVX512F-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vpermi2q %zmm3, %zmm4, %zmm9
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm3 = <12,1,6,u>
-; AVX512F-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
-; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm3 {%k1}
-; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,4,5,10,15]
-; AVX512F-NEXT:    vpermi2q %zmm0, %zmm3, %zmm1
-; AVX512F-NEXT:    vmovdqa64 %zmm6, (%rsi)
-; AVX512F-NEXT:    vmovdqa64 %zmm5, (%rdx)
+; AVX512F-NEXT:    vpermi2q %zmm0, %zmm6, %zmm8
+; AVX512F-NEXT:    vpermt2q %zmm2, %zmm3, %zmm1
+; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm1 {%k1}
+; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,5,10,15]
+; AVX512F-NEXT:    vpermi2q %zmm0, %zmm1, %zmm2
+; AVX512F-NEXT:    vmovdqa64 %zmm5, (%rsi)
+; AVX512F-NEXT:    vmovdqa64 %zmm4, (%rdx)
 ; AVX512F-NEXT:    vmovdqa64 %zmm7, (%rcx)
 ; AVX512F-NEXT:    vmovdqa64 %zmm8, (%r8)
-; AVX512F-NEXT:    vmovdqa64 %zmm1, (%r9)
+; AVX512F-NEXT:    vmovdqa64 %zmm2, (%r9)
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512BW-LABEL: load_i64_stride5_vf8:
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    vmovdqa64 256(%rdi), %zmm0
-; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm1
-; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %zmm2
-; AVX512BW-NEXT:    vmovdqa64 128(%rdi), %zmm3
-; AVX512BW-NEXT:    vmovdqa64 192(%rdi), %zmm4
-; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm5 = [12,1,6,0,12,1,6,0]
-; AVX512BW-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vpermi2q %zmm3, %zmm4, %zmm5
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm6 = [0,5,10,15]
-; AVX512BW-NEXT:    vpermi2q %zmm2, %zmm1, %zmm6
-; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm5 = zmm6[0,1,2,3],zmm5[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,4,5,6,11]
-; AVX512BW-NEXT:    vpermi2q %zmm0, %zmm5, %zmm6
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm5 = <1,6,11,u>
-; AVX512BW-NEXT:    vpermi2q %zmm2, %zmm1, %zmm5
-; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [5,10,15,0,5,10,15,0]
+; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm2
+; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %zmm1
+; AVX512BW-NEXT:    vmovdqa64 128(%rdi), %zmm4
+; AVX512BW-NEXT:    vmovdqa64 192(%rdi), %zmm5
+; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [12,1,6,0,12,1,6,0]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm6 = [5,10,15,0,5,10,15,0]
+; AVX512BW-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vpermi2q %zmm5, %zmm4, %zmm6
+; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [6,11,0,1,6,11,0,1]
 ; AVX512BW-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vpermi2q %zmm4, %zmm3, %zmm7
+; AVX512BW-NEXT:    vpermi2q %zmm5, %zmm4, %zmm7
+; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [7,12,0,2,7,12,0,2]
+; AVX512BW-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vpermi2q %zmm5, %zmm4, %zmm8
+; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm9 = [0,5,0,11,0,5,0,11]
+; AVX512BW-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vpermi2q %zmm4, %zmm5, %zmm9
+; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm3, %zmm5
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,5,10,15]
+; AVX512BW-NEXT:    vpermi2q %zmm1, %zmm2, %zmm4
+; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm5[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,5,6,11]
+; AVX512BW-NEXT:    vpermi2q %zmm0, %zmm4, %zmm5
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm4 = <1,6,11,u>
+; AVX512BW-NEXT:    vpermi2q %zmm1, %zmm2, %zmm4
 ; AVX512BW-NEXT:    movb $7, %al
 ; AVX512BW-NEXT:    kmovd %eax, %k1
-; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm7 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,5,6,12]
-; AVX512BW-NEXT:    vpermi2q %zmm0, %zmm7, %zmm5
-; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [6,11,0,1,6,11,0,1]
-; AVX512BW-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vpermi2q %zmm4, %zmm3, %zmm7
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm8 = <2,7,12,u>
-; AVX512BW-NEXT:    vpermi2q %zmm2, %zmm1, %zmm8
+; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm6 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,12]
+; AVX512BW-NEXT:    vpermi2q %zmm0, %zmm6, %zmm4
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm6 = <2,7,12,u>
+; AVX512BW-NEXT:    vpermi2q %zmm1, %zmm2, %zmm6
 ; AVX512BW-NEXT:    movb $56, %al
 ; AVX512BW-NEXT:    kmovd %eax, %k1
-; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm8 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm6 {%k1}
 ; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,8,13]
-; AVX512BW-NEXT:    vpermi2q %zmm0, %zmm8, %zmm7
-; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [7,12,0,2,7,12,0,2]
-; AVX512BW-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vpermi2q %zmm4, %zmm3, %zmm8
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm9 = <11,0,5,u>
-; AVX512BW-NEXT:    vpermi2q %zmm1, %zmm2, %zmm9
-; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm9 {%k1}
+; AVX512BW-NEXT:    vpermi2q %zmm0, %zmm6, %zmm7
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm6 = <11,0,5,u>
+; AVX512BW-NEXT:    vpermi2q %zmm2, %zmm1, %zmm6
+; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm6 {%k1}
 ; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,5,9,14]
-; AVX512BW-NEXT:    vpermi2q %zmm0, %zmm9, %zmm8
-; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm9 = [0,5,0,11,0,5,0,11]
-; AVX512BW-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vpermi2q %zmm3, %zmm4, %zmm9
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm3 = <12,1,6,u>
-; AVX512BW-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
-; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm3 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,4,5,10,15]
-; AVX512BW-NEXT:    vpermi2q %zmm0, %zmm3, %zmm1
-; AVX512BW-NEXT:    vmovdqa64 %zmm6, (%rsi)
-; AVX512BW-NEXT:    vmovdqa64 %zmm5, (%rdx)
+; AVX512BW-NEXT:    vpermi2q %zmm0, %zmm6, %zmm8
+; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm3, %zmm1
+; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm1 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,5,10,15]
+; AVX512BW-NEXT:    vpermi2q %zmm0, %zmm1, %zmm2
+; AVX512BW-NEXT:    vmovdqa64 %zmm5, (%rsi)
+; AVX512BW-NEXT:    vmovdqa64 %zmm4, (%rdx)
 ; AVX512BW-NEXT:    vmovdqa64 %zmm7, (%rcx)
 ; AVX512BW-NEXT:    vmovdqa64 %zmm8, (%r8)
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, (%r9)
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, (%r9)
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
   %wide.vec = load <40 x i64>, ptr %in.vec, align 64
@@ -1183,90 +1181,90 @@ define void @load_i64_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512F-NEXT:    vmovdqa64 448(%rdi), %zmm5
 ; AVX512F-NEXT:    vmovdqa64 512(%rdi), %zmm4
 ; AVX512F-NEXT:    vmovdqa64 256(%rdi), %zmm6
-; AVX512F-NEXT:    vmovdqa64 (%rdi), %zmm8
+; AVX512F-NEXT:    vmovdqa64 (%rdi), %zmm9
 ; AVX512F-NEXT:    vmovdqa64 64(%rdi), %zmm1
 ; AVX512F-NEXT:    vmovdqa64 128(%rdi), %zmm11
-; AVX512F-NEXT:    vmovdqa64 192(%rdi), %zmm9
-; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [12,1,6,0,12,1,6,0]
-; AVX512F-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm7
-; AVX512F-NEXT:    vpermt2q %zmm11, %zmm10, %zmm7
+; AVX512F-NEXT:    vmovdqa64 192(%rdi), %zmm10
+; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [12,1,6,0,12,1,6,0]
+; AVX512F-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vmovdqa64 %zmm10, %zmm8
+; AVX512F-NEXT:    vpermt2q %zmm11, %zmm7, %zmm8
 ; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm12 = [0,5,10,15]
-; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm13
+; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm13
 ; AVX512F-NEXT:    vpermt2q %zmm1, %zmm12, %zmm13
-; AVX512F-NEXT:    vshufi64x2 {{.*#+}} zmm7 = zmm13[0,1,2,3],zmm7[4,5,6,7]
+; AVX512F-NEXT:    vshufi64x2 {{.*#+}} zmm8 = zmm13[0,1,2,3],zmm8[4,5,6,7]
 ; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm13 = [0,1,2,3,4,5,6,11]
-; AVX512F-NEXT:    vpermt2q %zmm6, %zmm13, %zmm7
-; AVX512F-NEXT:    vpermi2q %zmm5, %zmm4, %zmm10
+; AVX512F-NEXT:    vpermt2q %zmm6, %zmm13, %zmm8
+; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm14
+; AVX512F-NEXT:    vpermt2q %zmm5, %zmm7, %zmm14
 ; AVX512F-NEXT:    vpermi2q %zmm0, %zmm3, %zmm12
-; AVX512F-NEXT:    vshufi64x2 {{.*#+}} zmm10 = zmm12[0,1,2,3],zmm10[4,5,6,7]
-; AVX512F-NEXT:    vpermt2q %zmm2, %zmm13, %zmm10
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm14 = <1,6,11,u>
-; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm15
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm14, %zmm15
-; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm13 = [5,10,15,0,5,10,15,0]
-; AVX512F-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm11, %zmm12
-; AVX512F-NEXT:    vpermt2q %zmm9, %zmm13, %zmm12
+; AVX512F-NEXT:    vshufi64x2 {{.*#+}} zmm12 = zmm12[0,1,2,3],zmm14[4,5,6,7]
+; AVX512F-NEXT:    vpermt2q %zmm2, %zmm13, %zmm12
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm15 = <1,6,11,u>
+; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm16
+; AVX512F-NEXT:    vpermt2q %zmm1, %zmm15, %zmm16
+; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm14 = [5,10,15,0,5,10,15,0]
+; AVX512F-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vmovdqa64 %zmm11, %zmm13
+; AVX512F-NEXT:    vpermt2q %zmm10, %zmm14, %zmm13
 ; AVX512F-NEXT:    movb $7, %al
 ; AVX512F-NEXT:    kmovw %eax, %k1
-; AVX512F-NEXT:    vmovdqa64 %zmm15, %zmm12 {%k1}
-; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm15 = [0,1,2,3,4,5,6,12]
-; AVX512F-NEXT:    vpermt2q %zmm6, %zmm15, %zmm12
-; AVX512F-NEXT:    vpermi2q %zmm4, %zmm5, %zmm13
-; AVX512F-NEXT:    vpermi2q %zmm0, %zmm3, %zmm14
-; AVX512F-NEXT:    vmovdqa64 %zmm14, %zmm13 {%k1}
-; AVX512F-NEXT:    vpermt2q %zmm2, %zmm15, %zmm13
-; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm14 = [6,11,0,1,6,11,0,1]
-; AVX512F-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm11, %zmm15
-; AVX512F-NEXT:    vpermt2q %zmm9, %zmm14, %zmm15
-; AVX512F-NEXT:    vmovdqa64 {{.*#+}} ymm16 = <2,7,12,u>
-; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm17
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm16, %zmm17
+; AVX512F-NEXT:    vmovdqa64 %zmm16, %zmm13 {%k1}
+; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm16 = [0,1,2,3,4,5,6,12]
+; AVX512F-NEXT:    vpermt2q %zmm6, %zmm16, %zmm13
+; AVX512F-NEXT:    vpermi2q %zmm4, %zmm5, %zmm14
+; AVX512F-NEXT:    vpermi2q %zmm0, %zmm3, %zmm15
+; AVX512F-NEXT:    vmovdqa64 %zmm15, %zmm14 {%k1}
+; AVX512F-NEXT:    vpermt2q %zmm2, %zmm16, %zmm14
+; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm15 = [6,11,0,1,6,11,0,1]
+; AVX512F-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vmovdqa64 %zmm11, %zmm16
+; AVX512F-NEXT:    vpermt2q %zmm10, %zmm15, %zmm16
+; AVX512F-NEXT:    vmovdqa64 {{.*#+}} ymm17 = <2,7,12,u>
+; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm18
+; AVX512F-NEXT:    vpermt2q %zmm1, %zmm17, %zmm18
 ; AVX512F-NEXT:    movb $56, %al
 ; AVX512F-NEXT:    kmovw %eax, %k1
+; AVX512F-NEXT:    vmovdqa64 %zmm16, %zmm18 {%k1}
+; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm16 = [0,1,2,3,4,5,8,13]
+; AVX512F-NEXT:    vpermt2q %zmm6, %zmm16, %zmm18
+; AVX512F-NEXT:    vpermi2q %zmm4, %zmm5, %zmm15
+; AVX512F-NEXT:    vpermi2q %zmm0, %zmm3, %zmm17
 ; AVX512F-NEXT:    vmovdqa64 %zmm15, %zmm17 {%k1}
-; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm15 = [0,1,2,3,4,5,8,13]
-; AVX512F-NEXT:    vpermt2q %zmm6, %zmm15, %zmm17
-; AVX512F-NEXT:    vpermi2q %zmm4, %zmm5, %zmm14
-; AVX512F-NEXT:    vpermi2q %zmm0, %zmm3, %zmm16
-; AVX512F-NEXT:    vmovdqa64 %zmm14, %zmm16 {%k1}
-; AVX512F-NEXT:    vpermt2q %zmm2, %zmm15, %zmm16
-; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm14 = [7,12,0,2,7,12,0,2]
-; AVX512F-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm11, %zmm15
-; AVX512F-NEXT:    vpermt2q %zmm9, %zmm14, %zmm15
-; AVX512F-NEXT:    vmovdqa64 {{.*#+}} ymm18 = <11,0,5,u>
-; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm19
-; AVX512F-NEXT:    vpermt2q %zmm8, %zmm18, %zmm19
+; AVX512F-NEXT:    vpermt2q %zmm2, %zmm16, %zmm17
+; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm15 = [7,12,0,2,7,12,0,2]
+; AVX512F-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vmovdqa64 %zmm11, %zmm16
+; AVX512F-NEXT:    vpermt2q %zmm10, %zmm15, %zmm16
+; AVX512F-NEXT:    vmovdqa64 {{.*#+}} ymm19 = <11,0,5,u>
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm20
+; AVX512F-NEXT:    vpermt2q %zmm9, %zmm19, %zmm20
+; AVX512F-NEXT:    vmovdqa64 %zmm16, %zmm20 {%k1}
+; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm16 = [0,1,2,3,4,5,9,14]
+; AVX512F-NEXT:    vpermt2q %zmm6, %zmm16, %zmm20
+; AVX512F-NEXT:    vpermi2q %zmm4, %zmm5, %zmm15
+; AVX512F-NEXT:    vpermi2q %zmm3, %zmm0, %zmm19
 ; AVX512F-NEXT:    vmovdqa64 %zmm15, %zmm19 {%k1}
-; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm15 = [0,1,2,3,4,5,9,14]
-; AVX512F-NEXT:    vpermt2q %zmm6, %zmm15, %zmm19
-; AVX512F-NEXT:    vpermi2q %zmm4, %zmm5, %zmm14
-; AVX512F-NEXT:    vpermi2q %zmm3, %zmm0, %zmm18
-; AVX512F-NEXT:    vmovdqa64 %zmm14, %zmm18 {%k1}
-; AVX512F-NEXT:    vpermt2q %zmm2, %zmm15, %zmm18
-; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm14 = [0,5,0,11,0,5,0,11]
-; AVX512F-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vpermt2q %zmm11, %zmm14, %zmm9
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm11 = <12,1,6,u>
-; AVX512F-NEXT:    vpermt2q %zmm8, %zmm11, %zmm1
-; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm1 {%k1}
-; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,5,10,15]
-; AVX512F-NEXT:    vpermt2q %zmm6, %zmm8, %zmm1
-; AVX512F-NEXT:    vpermt2q %zmm5, %zmm14, %zmm4
-; AVX512F-NEXT:    vpermt2q %zmm3, %zmm11, %zmm0
+; AVX512F-NEXT:    vpermt2q %zmm2, %zmm16, %zmm19
+; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm15 = [0,5,0,11,0,5,0,11]
+; AVX512F-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vpermt2q %zmm11, %zmm15, %zmm10
+; AVX512F-NEXT:    vpermt2q %zmm9, %zmm7, %zmm1
+; AVX512F-NEXT:    vmovdqa64 %zmm10, %zmm1 {%k1}
+; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm9 = [0,1,2,3,4,5,10,15]
+; AVX512F-NEXT:    vpermt2q %zmm6, %zmm9, %zmm1
+; AVX512F-NEXT:    vpermt2q %zmm5, %zmm15, %zmm4
+; AVX512F-NEXT:    vpermt2q %zmm3, %zmm7, %zmm0
 ; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm0 {%k1}
-; AVX512F-NEXT:    vpermt2q %zmm2, %zmm8, %zmm0
-; AVX512F-NEXT:    vmovdqa64 %zmm10, 64(%rsi)
-; AVX512F-NEXT:    vmovdqa64 %zmm7, (%rsi)
-; AVX512F-NEXT:    vmovdqa64 %zmm13, 64(%rdx)
-; AVX512F-NEXT:    vmovdqa64 %zmm12, (%rdx)
-; AVX512F-NEXT:    vmovdqa64 %zmm16, 64(%rcx)
-; AVX512F-NEXT:    vmovdqa64 %zmm17, (%rcx)
-; AVX512F-NEXT:    vmovdqa64 %zmm18, 64(%r8)
-; AVX512F-NEXT:    vmovdqa64 %zmm19, (%r8)
+; AVX512F-NEXT:    vpermt2q %zmm2, %zmm9, %zmm0
+; AVX512F-NEXT:    vmovdqa64 %zmm12, 64(%rsi)
+; AVX512F-NEXT:    vmovdqa64 %zmm8, (%rsi)
+; AVX512F-NEXT:    vmovdqa64 %zmm14, 64(%rdx)
+; AVX512F-NEXT:    vmovdqa64 %zmm13, (%rdx)
+; AVX512F-NEXT:    vmovdqa64 %zmm17, 64(%rcx)
+; AVX512F-NEXT:    vmovdqa64 %zmm18, (%rcx)
+; AVX512F-NEXT:    vmovdqa64 %zmm19, 64(%r8)
+; AVX512F-NEXT:    vmovdqa64 %zmm20, (%r8)
 ; AVX512F-NEXT:    vmovdqa64 %zmm0, 64(%r9)
 ; AVX512F-NEXT:    vmovdqa64 %zmm1, (%r9)
 ; AVX512F-NEXT:    vzeroupper
@@ -1280,90 +1278,90 @@ define void @load_i64_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-NEXT:    vmovdqa64 448(%rdi), %zmm5
 ; AVX512BW-NEXT:    vmovdqa64 512(%rdi), %zmm4
 ; AVX512BW-NEXT:    vmovdqa64 256(%rdi), %zmm6
-; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm8
+; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm9
 ; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %zmm1
 ; AVX512BW-NEXT:    vmovdqa64 128(%rdi), %zmm11
-; AVX512BW-NEXT:    vmovdqa64 192(%rdi), %zmm9
-; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [12,1,6,0,12,1,6,0]
-; AVX512BW-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm7
-; AVX512BW-NEXT:    vpermt2q %zmm11, %zmm10, %zmm7
+; AVX512BW-NEXT:    vmovdqa64 192(%rdi), %zmm10
+; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [12,1,6,0,12,1,6,0]
+; AVX512BW-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm8
+; AVX512BW-NEXT:    vpermt2q %zmm11, %zmm7, %zmm8
 ; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm12 = [0,5,10,15]
-; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm13
+; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm13
 ; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm12, %zmm13
-; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm7 = zmm13[0,1,2,3],zmm7[4,5,6,7]
+; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm8 = zmm13[0,1,2,3],zmm8[4,5,6,7]
 ; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm13 = [0,1,2,3,4,5,6,11]
-; AVX512BW-NEXT:    vpermt2q %zmm6, %zmm13, %zmm7
-; AVX512BW-NEXT:    vpermi2q %zmm5, %zmm4, %zmm10
+; AVX512BW-NEXT:    vpermt2q %zmm6, %zmm13, %zmm8
+; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm14
+; AVX512BW-NEXT:    vpermt2q %zmm5, %zmm7, %zmm14
 ; AVX512BW-NEXT:    vpermi2q %zmm0, %zmm3, %zmm12
-; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm10 = zmm12[0,1,2,3],zmm10[4,5,6,7]
-; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm13, %zmm10
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm14 = <1,6,11,u>
-; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm15
-; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm14, %zmm15
-; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm13 = [5,10,15,0,5,10,15,0]
-; AVX512BW-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm12
-; AVX512BW-NEXT:    vpermt2q %zmm9, %zmm13, %zmm12
+; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm12 = zmm12[0,1,2,3],zmm14[4,5,6,7]
+; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm13, %zmm12
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm15 = <1,6,11,u>
+; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm16
+; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm15, %zmm16
+; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm14 = [5,10,15,0,5,10,15,0]
+; AVX512BW-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm13
+; AVX512BW-NEXT:    vpermt2q %zmm10, %zmm14, %zmm13
 ; AVX512BW-NEXT:    movb $7, %al
 ; AVX512BW-NEXT:    kmovd %eax, %k1
-; AVX512BW-NEXT:    vmovdqa64 %zmm15, %zmm12 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm15 = [0,1,2,3,4,5,6,12]
-; AVX512BW-NEXT:    vpermt2q %zmm6, %zmm15, %zmm12
-; AVX512BW-NEXT:    vpermi2q %zmm4, %zmm5, %zmm13
-; AVX512BW-NEXT:    vpermi2q %zmm0, %zmm3, %zmm14
-; AVX512BW-NEXT:    vmovdqa64 %zmm14, %zmm13 {%k1}
-; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm15, %zmm13
-; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm14 = [6,11,0,1,6,11,0,1]
-; AVX512BW-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm15
-; AVX512BW-NEXT:    vpermt2q %zmm9, %zmm14, %zmm15
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} ymm16 = <2,7,12,u>
-; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm17
-; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm16, %zmm17
+; AVX512BW-NEXT:    vmovdqa64 %zmm16, %zmm13 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm16 = [0,1,2,3,4,5,6,12]
+; AVX512BW-NEXT:    vpermt2q %zmm6, %zmm16, %zmm13
+; AVX512BW-NEXT:    vpermi2q %zmm4, %zmm5, %zmm14
+; AVX512BW-NEXT:    vpermi2q %zmm0, %zmm3, %zmm15
+; AVX512BW-NEXT:    vmovdqa64 %zmm15, %zmm14 {%k1}
+; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm16, %zmm14
+; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm15 = [6,11,0,1,6,11,0,1]
+; AVX512BW-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm16
+; AVX512BW-NEXT:    vpermt2q %zmm10, %zmm15, %zmm16
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} ymm17 = <2,7,12,u>
+; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm18
+; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm17, %zmm18
 ; AVX512BW-NEXT:    movb $56, %al
 ; AVX512BW-NEXT:    kmovd %eax, %k1
+; AVX512BW-NEXT:    vmovdqa64 %zmm16, %zmm18 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm16 = [0,1,2,3,4,5,8,13]
+; AVX512BW-NEXT:    vpermt2q %zmm6, %zmm16, %zmm18
+; AVX512BW-NEXT:    vpermi2q %zmm4, %zmm5, %zmm15
+; AVX512BW-NEXT:    vpermi2q %zmm0, %zmm3, %zmm17
 ; AVX512BW-NEXT:    vmovdqa64 %zmm15, %zmm17 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm15 = [0,1,2,3,4,5,8,13]
-; AVX512BW-NEXT:    vpermt2q %zmm6, %zmm15, %zmm17
-; AVX512BW-NEXT:    vpermi2q %zmm4, %zmm5, %zmm14
-; AVX512BW-NEXT:    vpermi2q %zmm0, %zmm3, %zmm16
-; AVX512BW-NEXT:    vmovdqa64 %zmm14, %zmm16 {%k1}
-; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm15, %zmm16
-; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm14 = [7,12,0,2,7,12,0,2]
-; AVX512BW-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm15
-; AVX512BW-NEXT:    vpermt2q %zmm9, %zmm14, %zmm15
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} ymm18 = <11,0,5,u>
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm19
-; AVX512BW-NEXT:    vpermt2q %zmm8, %zmm18, %zmm19
+; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm16, %zmm17
+; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm15 = [7,12,0,2,7,12,0,2]
+; AVX512BW-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm16
+; AVX512BW-NEXT:    vpermt2q %zmm10, %zmm15, %zmm16
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} ymm19 = <11,0,5,u>
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm20
+; AVX512BW-NEXT:    vpermt2q %zmm9, %zmm19, %zmm20
+; AVX512BW-NEXT:    vmovdqa64 %zmm16, %zmm20 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm16 = [0,1,2,3,4,5,9,14]
+; AVX512BW-NEXT:    vpermt2q %zmm6, %zmm16, %zmm20
+; AVX512BW-NEXT:    vpermi2q %zmm4, %zmm5, %zmm15
+; AVX512BW-NEXT:    vpermi2q %zmm3, %zmm0, %zmm19
 ; AVX512BW-NEXT:    vmovdqa64 %zmm15, %zmm19 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm15 = [0,1,2,3,4,5,9,14]
-; AVX512BW-NEXT:    vpermt2q %zmm6, %zmm15, %zmm19
-; AVX512BW-NEXT:    vpermi2q %zmm4, %zmm5, %zmm14
-; AVX512BW-NEXT:    vpermi2q %zmm3, %zmm0, %zmm18
-; AVX512BW-NEXT:    vmovdqa64 %zmm14, %zmm18 {%k1}
-; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm15, %zmm18
-; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm14 = [0,5,0,11,0,5,0,11]
-; AVX512BW-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vpermt2q %zmm11, %zmm14, %zmm9
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm11 = <12,1,6,u>
-; AVX512BW-NEXT:    vpermt2q %zmm8, %zmm11, %zmm1
-; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm1 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,5,10,15]
-; AVX512BW-NEXT:    vpermt2q %zmm6, %zmm8, %zmm1
-; AVX512BW-NEXT:    vpermt2q %zmm5, %zmm14, %zmm4
-; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm11, %zmm0
+; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm16, %zmm19
+; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm15 = [0,5,0,11,0,5,0,11]
+; AVX512BW-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vpermt2q %zmm11, %zmm15, %zmm10
+; AVX512BW-NEXT:    vpermt2q %zmm9, %zmm7, %zmm1
+; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm1 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm9 = [0,1,2,3,4,5,10,15]
+; AVX512BW-NEXT:    vpermt2q %zmm6, %zmm9, %zmm1
+; AVX512BW-NEXT:    vpermt2q %zmm5, %zmm15, %zmm4
+; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm7, %zmm0
 ; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm0 {%k1}
-; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm8, %zmm0
-; AVX512BW-NEXT:    vmovdqa64 %zmm10, 64(%rsi)
-; AVX512BW-NEXT:    vmovdqa64 %zmm7, (%rsi)
-; AVX512BW-NEXT:    vmovdqa64 %zmm13, 64(%rdx)
-; AVX512BW-NEXT:    vmovdqa64 %zmm12, (%rdx)
-; AVX512BW-NEXT:    vmovdqa64 %zmm16, 64(%rcx)
-; AVX512BW-NEXT:    vmovdqa64 %zmm17, (%rcx)
-; AVX512BW-NEXT:    vmovdqa64 %zmm18, 64(%r8)
-; AVX512BW-NEXT:    vmovdqa64 %zmm19, (%r8)
+; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm9, %zmm0
+; AVX512BW-NEXT:    vmovdqa64 %zmm12, 64(%rsi)
+; AVX512BW-NEXT:    vmovdqa64 %zmm8, (%rsi)
+; AVX512BW-NEXT:    vmovdqa64 %zmm14, 64(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm13, (%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm17, 64(%rcx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm18, (%rcx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm19, 64(%r8)
+; AVX512BW-NEXT:    vmovdqa64 %zmm20, (%r8)
 ; AVX512BW-NEXT:    vmovdqa64 %zmm0, 64(%r9)
 ; AVX512BW-NEXT:    vmovdqa64 %zmm1, (%r9)
 ; AVX512BW-NEXT:    vzeroupper
@@ -2559,397 +2557,403 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ;
 ; AVX512F-LABEL: load_i64_stride5_vf32:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    subq $584, %rsp # imm = 0x248
+; AVX512F-NEXT:    subq $648, %rsp # imm = 0x288
 ; AVX512F-NEXT:    vmovdqa64 1088(%rdi), %zmm21
 ; AVX512F-NEXT:    vmovdqa64 1152(%rdi), %zmm1
-; AVX512F-NEXT:    vmovdqa64 768(%rdi), %zmm20
+; AVX512F-NEXT:    vmovdqa64 768(%rdi), %zmm18
 ; AVX512F-NEXT:    vmovdqa64 832(%rdi), %zmm0
-; AVX512F-NEXT:    vmovdqa64 384(%rdi), %zmm14
-; AVX512F-NEXT:    vmovdqa64 448(%rdi), %zmm25
+; AVX512F-NEXT:    vmovdqa64 384(%rdi), %zmm17
+; AVX512F-NEXT:    vmovdqa64 320(%rdi), %zmm19
+; AVX512F-NEXT:    vmovdqa64 448(%rdi), %zmm26
 ; AVX512F-NEXT:    vmovdqa64 512(%rdi), %zmm2
-; AVX512F-NEXT:    vmovdqa64 (%rdi), %zmm22
-; AVX512F-NEXT:    vmovdqa64 64(%rdi), %zmm13
-; AVX512F-NEXT:    vmovdqa64 128(%rdi), %zmm27
+; AVX512F-NEXT:    vmovdqa64 (%rdi), %zmm16
+; AVX512F-NEXT:    vmovdqa64 128(%rdi), %zmm25
 ; AVX512F-NEXT:    vmovdqa64 192(%rdi), %zmm3
-; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm26 = [12,1,6,0,12,1,6,0]
-; AVX512F-NEXT:    # zmm26 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm11 = [12,1,6,0,12,1,6,0]
+; AVX512F-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3]
 ; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm10
-; AVX512F-NEXT:    vpermt2q %zmm27, %zmm26, %zmm10
+; AVX512F-NEXT:    vpermt2q %zmm25, %zmm11, %zmm10
 ; AVX512F-NEXT:    vmovdqa64 {{.*#+}} ymm24 = [0,5,10,15]
-; AVX512F-NEXT:    vmovdqa64 %zmm22, %zmm11
-; AVX512F-NEXT:    vpermt2q %zmm13, %zmm24, %zmm11
+; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm12
 ; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm4
-; AVX512F-NEXT:    vpermt2q %zmm25, %zmm26, %zmm4
-; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm26, %zmm11, %zmm12
+; AVX512F-NEXT:    vmovdqa64 %zmm19, %zmm2
+; AVX512F-NEXT:    vpermt2q %zmm17, %zmm24, %zmm2
+; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm5
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm4
-; AVX512F-NEXT:    vpermt2q %zmm20, %zmm26, %zmm5
+; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm2
+; AVX512F-NEXT:    vpermt2q %zmm18, %zmm11, %zmm5
 ; AVX512F-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermi2q %zmm21, %zmm1, %zmm26
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm0
+; AVX512F-NEXT:    vpermt2q %zmm21, %zmm11, %zmm0
+; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-NEXT:    vmovdqa64 {{.*#+}} ymm28 = <1,6,11,u>
-; AVX512F-NEXT:    vmovdqa64 %zmm22, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm13, %zmm28, %zmm0
+; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm15 = [5,10,15,0,5,10,15,0]
+; AVX512F-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vmovdqa64 %zmm25, %zmm30
+; AVX512F-NEXT:    vpermt2q %zmm3, %zmm15, %zmm30
+; AVX512F-NEXT:    vmovdqa64 %zmm18, %zmm27
+; AVX512F-NEXT:    vpermt2q %zmm2, %zmm15, %zmm27
+; AVX512F-NEXT:    vmovdqa64 %zmm26, %zmm22
+; AVX512F-NEXT:    vpermt2q %zmm4, %zmm15, %zmm22
+; AVX512F-NEXT:    vmovdqa64 %zmm19, %zmm0
+; AVX512F-NEXT:    vpermt2q %zmm17, %zmm28, %zmm0
 ; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm12 = [5,10,15,0,5,10,15,0]
-; AVX512F-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm27, %zmm30
-; AVX512F-NEXT:    vpermt2q %zmm3, %zmm12, %zmm30
-; AVX512F-NEXT:    vmovdqa64 %zmm20, %zmm19
-; AVX512F-NEXT:    vpermt2q %zmm4, %zmm12, %zmm19
-; AVX512F-NEXT:    vmovdqa64 %zmm25, %zmm18
-; AVX512F-NEXT:    vpermt2q %zmm2, %zmm12, %zmm18
-; AVX512F-NEXT:    vpermi2q %zmm1, %zmm21, %zmm12
+; AVX512F-NEXT:    vpermi2q %zmm1, %zmm21, %zmm15
 ; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm23 = [6,11,0,1,6,11,0,1]
 ; AVX512F-NEXT:    # zmm23 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm27, %zmm0
+; AVX512F-NEXT:    vmovdqa64 %zmm25, %zmm0
 ; AVX512F-NEXT:    vpermt2q %zmm3, %zmm23, %zmm0
 ; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm15 = <2,7,12,u>
-; AVX512F-NEXT:    vmovdqa64 %zmm22, %zmm16
-; AVX512F-NEXT:    vpermt2q %zmm13, %zmm15, %zmm16
-; AVX512F-NEXT:    vmovdqa64 %zmm20, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm4, %zmm23, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm25, %zmm0
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm13 = <2,7,12,u>
+; AVX512F-NEXT:    vmovdqa64 %zmm18, %zmm0
 ; AVX512F-NEXT:    vpermt2q %zmm2, %zmm23, %zmm0
 ; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm26, %zmm0
+; AVX512F-NEXT:    vpermt2q %zmm4, %zmm23, %zmm0
+; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-NEXT:    vpermi2q %zmm1, %zmm21, %zmm23
 ; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm29 = [7,12,0,2,7,12,0,2]
 ; AVX512F-NEXT:    # zmm29 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm27, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm3, %zmm29, %zmm0
+; AVX512F-NEXT:    vmovdqa64 %zmm26, %zmm0
+; AVX512F-NEXT:    vpermt2q %zmm4, %zmm29, %zmm0
 ; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [0,5,0,11,0,5,0,11]
 ; AVX512F-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vpermt2q %zmm27, %zmm0, %zmm3
-; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm25, %zmm27
-; AVX512F-NEXT:    vpermt2q %zmm2, %zmm29, %zmm27
-; AVX512F-NEXT:    vpermt2q %zmm25, %zmm0, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, (%rsp) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm20, %zmm17
-; AVX512F-NEXT:    vpermt2q %zmm4, %zmm29, %zmm17
+; AVX512F-NEXT:    vpermt2q %zmm26, %zmm0, %zmm4
+; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm25, %zmm26
+; AVX512F-NEXT:    vpermt2q %zmm3, %zmm29, %zmm26
+; AVX512F-NEXT:    vpermt2q %zmm25, %zmm0, %zmm3
+; AVX512F-NEXT:    vmovdqu64 %zmm3, (%rsp) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm18, %zmm20
+; AVX512F-NEXT:    vpermt2q %zmm2, %zmm29, %zmm20
 ; AVX512F-NEXT:    vpermi2q %zmm1, %zmm21, %zmm29
 ; AVX512F-NEXT:    vpermt2q %zmm21, %zmm0, %zmm1
 ; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 {{.*#+}} ymm21 = <11,0,5,u>
-; AVX512F-NEXT:    vpermt2q %zmm20, %zmm0, %zmm4
-; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm20
-; AVX512F-NEXT:    vpermt2q %zmm22, %zmm21, %zmm20
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm9 = <12,1,6,u>
-; AVX512F-NEXT:    vpermt2q %zmm22, %zmm9, %zmm13
-; AVX512F-NEXT:    vmovdqa64 320(%rdi), %zmm0
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm7
-; AVX512F-NEXT:    vpermt2q %zmm14, %zmm24, %zmm7
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm8
-; AVX512F-NEXT:    vpermt2q %zmm14, %zmm28, %zmm8
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm31
-; AVX512F-NEXT:    vpermt2q %zmm14, %zmm15, %zmm31
-; AVX512F-NEXT:    vmovdqa64 %zmm14, %zmm22
-; AVX512F-NEXT:    vpermt2q %zmm0, %zmm21, %zmm22
-; AVX512F-NEXT:    vpermt2q %zmm0, %zmm9, %zmm14
-; AVX512F-NEXT:    vmovdqa64 704(%rdi), %zmm25
+; AVX512F-NEXT:    vmovdqa64 %zmm19, %zmm21
+; AVX512F-NEXT:    vpermt2q %zmm17, %zmm13, %zmm21
+; AVX512F-NEXT:    vpermt2q %zmm18, %zmm0, %zmm2
+; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 {{.*#+}} ymm31 = <11,0,5,u>
+; AVX512F-NEXT:    vmovdqa64 %zmm17, %zmm25
+; AVX512F-NEXT:    vpermt2q %zmm19, %zmm31, %zmm25
+; AVX512F-NEXT:    vpermt2q %zmm19, %zmm11, %zmm17
+; AVX512F-NEXT:    vmovdqa64 64(%rdi), %zmm18
+; AVX512F-NEXT:    vmovdqa64 %zmm16, %zmm6
+; AVX512F-NEXT:    vpermt2q %zmm18, %zmm24, %zmm6
+; AVX512F-NEXT:    vmovdqa64 %zmm16, %zmm9
+; AVX512F-NEXT:    vpermt2q %zmm18, %zmm28, %zmm9
+; AVX512F-NEXT:    vmovdqa64 %zmm16, %zmm8
+; AVX512F-NEXT:    vpermt2q %zmm18, %zmm13, %zmm8
+; AVX512F-NEXT:    vmovdqa64 %zmm18, %zmm19
+; AVX512F-NEXT:    vpermt2q %zmm16, %zmm31, %zmm19
+; AVX512F-NEXT:    vpermt2q %zmm16, %zmm11, %zmm18
+; AVX512F-NEXT:    vmovdqa64 704(%rdi), %zmm16
 ; AVX512F-NEXT:    vmovdqa64 640(%rdi), %zmm1
 ; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm3
-; AVX512F-NEXT:    vpermt2q %zmm25, %zmm24, %zmm3
+; AVX512F-NEXT:    vpermt2q %zmm16, %zmm24, %zmm3
 ; AVX512F-NEXT:    vmovdqa64 1024(%rdi), %zmm2
 ; AVX512F-NEXT:    vmovdqa64 960(%rdi), %zmm0
 ; AVX512F-NEXT:    vpermi2q %zmm2, %zmm0, %zmm24
-; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm6
-; AVX512F-NEXT:    vpermt2q %zmm25, %zmm28, %zmm6
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm7
+; AVX512F-NEXT:    vpermt2q %zmm16, %zmm28, %zmm7
 ; AVX512F-NEXT:    vpermi2q %zmm2, %zmm0, %zmm28
 ; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm4
-; AVX512F-NEXT:    vpermt2q %zmm25, %zmm15, %zmm4
-; AVX512F-NEXT:    vpermi2q %zmm2, %zmm0, %zmm15
-; AVX512F-NEXT:    vmovdqa64 %zmm25, %zmm5
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm21, %zmm5
-; AVX512F-NEXT:    vpermi2q %zmm0, %zmm2, %zmm21
-; AVX512F-NEXT:    vpermt2q %zmm0, %zmm9, %zmm2
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm9, %zmm25
-; AVX512F-NEXT:    vshufi64x2 {{.*#+}} zmm11 = zmm11[0,1,2,3],zmm10[4,5,6,7]
-; AVX512F-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm10 # 64-byte Folded Reload
-; AVX512F-NEXT:    # zmm10 = zmm7[0,1,2,3],mem[4,5,6,7]
-; AVX512F-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 64-byte Folded Reload
-; AVX512F-NEXT:    # zmm3 = zmm3[0,1,2,3],mem[4,5,6,7]
-; AVX512F-NEXT:    vshufi64x2 {{.*#+}} zmm7 = zmm24[0,1,2,3],zmm26[4,5,6,7]
-; AVX512F-NEXT:    vmovdqa64 256(%rdi), %zmm9
+; AVX512F-NEXT:    vpermt2q %zmm16, %zmm13, %zmm4
+; AVX512F-NEXT:    vpermi2q %zmm2, %zmm0, %zmm13
+; AVX512F-NEXT:    vmovdqa64 %zmm16, %zmm5
+; AVX512F-NEXT:    vpermt2q %zmm1, %zmm31, %zmm5
+; AVX512F-NEXT:    vpermi2q %zmm0, %zmm2, %zmm31
+; AVX512F-NEXT:    vpermt2q %zmm0, %zmm11, %zmm2
+; AVX512F-NEXT:    vpermt2q %zmm1, %zmm11, %zmm16
+; AVX512F-NEXT:    vshufi64x2 {{.*#+}} zmm14 = zmm6[0,1,2,3],zmm10[4,5,6,7]
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vshufi64x2 {{.*#+}} zmm12 = zmm0[0,1,2,3],zmm12[4,5,6,7]
+; AVX512F-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm10 # 64-byte Folded Reload
+; AVX512F-NEXT:    # zmm10 = zmm3[0,1,2,3],mem[4,5,6,7]
+; AVX512F-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm24, %zmm6 # 64-byte Folded Reload
+; AVX512F-NEXT:    # zmm6 = zmm24[0,1,2,3],mem[4,5,6,7]
+; AVX512F-NEXT:    vmovdqa64 256(%rdi), %zmm11
 ; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm24 = [0,1,2,3,4,5,6,11]
-; AVX512F-NEXT:    vpermt2q %zmm9, %zmm24, %zmm11
-; AVX512F-NEXT:    vmovdqa64 576(%rdi), %zmm26
-; AVX512F-NEXT:    vpermt2q %zmm26, %zmm24, %zmm10
-; AVX512F-NEXT:    vmovdqa64 896(%rdi), %zmm1
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm24, %zmm3
-; AVX512F-NEXT:    vmovdqa64 1216(%rdi), %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm0, %zmm24, %zmm7
+; AVX512F-NEXT:    vpermt2q %zmm11, %zmm24, %zmm14
+; AVX512F-NEXT:    vmovdqa64 576(%rdi), %zmm0
+; AVX512F-NEXT:    vpermt2q %zmm0, %zmm24, %zmm12
+; AVX512F-NEXT:    vmovdqa64 896(%rdi), %zmm3
+; AVX512F-NEXT:    vpermt2q %zmm3, %zmm24, %zmm10
+; AVX512F-NEXT:    vmovdqa64 1216(%rdi), %zmm1
+; AVX512F-NEXT:    vpermt2q %zmm1, %zmm24, %zmm6
 ; AVX512F-NEXT:    movb $7, %al
 ; AVX512F-NEXT:    kmovw %eax, %k1
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm24, %zmm30 {%k1}
-; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm19 {%k1}
-; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm18 {%k1}
-; AVX512F-NEXT:    vmovdqa64 %zmm28, %zmm12 {%k1}
-; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,4,5,6,12]
-; AVX512F-NEXT:    vpermt2q %zmm9, %zmm6, %zmm30
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm6, %zmm19
-; AVX512F-NEXT:    vpermt2q %zmm26, %zmm6, %zmm18
-; AVX512F-NEXT:    vpermt2q %zmm0, %zmm6, %zmm12
+; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm30 {%k1}
+; AVX512F-NEXT:    vmovdqa64 %zmm7, %zmm27 {%k1}
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm7, %zmm22 {%k1}
+; AVX512F-NEXT:    vmovdqa64 %zmm28, %zmm15 {%k1}
+; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,6,12]
+; AVX512F-NEXT:    vpermt2q %zmm11, %zmm7, %zmm30
+; AVX512F-NEXT:    vpermt2q %zmm3, %zmm7, %zmm27
+; AVX512F-NEXT:    vpermt2q %zmm0, %zmm7, %zmm22
+; AVX512F-NEXT:    vpermt2q %zmm1, %zmm7, %zmm15
 ; AVX512F-NEXT:    movb $56, %al
 ; AVX512F-NEXT:    kmovw %eax, %k1
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm16 {%k1}
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm4 {%k1}
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm31 {%k1}
-; AVX512F-NEXT:    vmovdqa64 %zmm23, %zmm15 {%k1}
-; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,4,5,8,13]
-; AVX512F-NEXT:    vpermt2q %zmm9, %zmm6, %zmm16
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm6, %zmm4
-; AVX512F-NEXT:    vpermt2q %zmm26, %zmm6, %zmm31
-; AVX512F-NEXT:    vpermt2q %zmm0, %zmm6, %zmm15
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm20 {%k1}
-; AVX512F-NEXT:    vmovdqa64 %zmm17, %zmm5 {%k1}
-; AVX512F-NEXT:    vmovdqa64 %zmm27, %zmm22 {%k1}
-; AVX512F-NEXT:    vmovdqa64 %zmm29, %zmm21 {%k1}
-; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,4,5,9,14]
-; AVX512F-NEXT:    vpermt2q %zmm9, %zmm6, %zmm20
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm6, %zmm5
-; AVX512F-NEXT:    vpermt2q %zmm26, %zmm6, %zmm22
-; AVX512F-NEXT:    vpermt2q %zmm0, %zmm6, %zmm21
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm13 {%k1}
-; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,4,5,10,15]
-; AVX512F-NEXT:    vpermt2q %zmm9, %zmm6, %zmm13
-; AVX512F-NEXT:    vmovdqu64 (%rsp), %zmm8 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm14 {%k1}
-; AVX512F-NEXT:    vpermt2q %zmm26, %zmm6, %zmm14
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm2 {%k1}
-; AVX512F-NEXT:    vpermt2q %zmm0, %zmm6, %zmm2
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm7, %zmm8 {%k1}
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm7, %zmm4 {%k1}
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm7, %zmm21 {%k1}
+; AVX512F-NEXT:    vmovdqa64 %zmm23, %zmm13 {%k1}
+; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,8,13]
+; AVX512F-NEXT:    vpermt2q %zmm11, %zmm7, %zmm8
+; AVX512F-NEXT:    vpermt2q %zmm3, %zmm7, %zmm4
+; AVX512F-NEXT:    vpermt2q %zmm0, %zmm7, %zmm21
+; AVX512F-NEXT:    vpermt2q %zmm1, %zmm7, %zmm13
+; AVX512F-NEXT:    vmovdqa64 %zmm26, %zmm19 {%k1}
+; AVX512F-NEXT:    vmovdqa64 %zmm20, %zmm5 {%k1}
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm7, %zmm25 {%k1}
+; AVX512F-NEXT:    vmovdqa64 %zmm29, %zmm31 {%k1}
+; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,9,14]
+; AVX512F-NEXT:    vpermt2q %zmm11, %zmm7, %zmm19
+; AVX512F-NEXT:    vpermt2q %zmm3, %zmm7, %zmm5
+; AVX512F-NEXT:    vpermt2q %zmm0, %zmm7, %zmm25
+; AVX512F-NEXT:    vpermt2q %zmm1, %zmm7, %zmm31
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm7, %zmm17 {%k1}
+; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,10,15]
+; AVX512F-NEXT:    vpermt2q %zmm0, %zmm7, %zmm17
+; AVX512F-NEXT:    vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm18 {%k1}
+; AVX512F-NEXT:    vpermt2q %zmm11, %zmm7, %zmm18
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm25 {%k1}
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm6, %zmm25
-; AVX512F-NEXT:    vmovdqa64 %zmm7, 192(%rsi)
-; AVX512F-NEXT:    vmovdqa64 %zmm3, 128(%rsi)
-; AVX512F-NEXT:    vmovdqa64 %zmm10, 64(%rsi)
-; AVX512F-NEXT:    vmovdqa64 %zmm11, (%rsi)
-; AVX512F-NEXT:    vmovdqa64 %zmm12, 192(%rdx)
+; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k1}
+; AVX512F-NEXT:    vpermt2q %zmm1, %zmm7, %zmm2
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm16 {%k1}
+; AVX512F-NEXT:    vpermt2q %zmm3, %zmm7, %zmm16
+; AVX512F-NEXT:    vmovdqa64 %zmm6, 192(%rsi)
+; AVX512F-NEXT:    vmovdqa64 %zmm10, 128(%rsi)
+; AVX512F-NEXT:    vmovdqa64 %zmm12, 64(%rsi)
+; AVX512F-NEXT:    vmovdqa64 %zmm14, (%rsi)
+; AVX512F-NEXT:    vmovdqa64 %zmm15, 192(%rdx)
 ; AVX512F-NEXT:    vmovdqa64 %zmm30, (%rdx)
-; AVX512F-NEXT:    vmovdqa64 %zmm18, 64(%rdx)
-; AVX512F-NEXT:    vmovdqa64 %zmm19, 128(%rdx)
-; AVX512F-NEXT:    vmovdqa64 %zmm15, 192(%rcx)
-; AVX512F-NEXT:    vmovdqa64 %zmm16, (%rcx)
-; AVX512F-NEXT:    vmovdqa64 %zmm31, 64(%rcx)
+; AVX512F-NEXT:    vmovdqa64 %zmm22, 64(%rdx)
+; AVX512F-NEXT:    vmovdqa64 %zmm27, 128(%rdx)
+; AVX512F-NEXT:    vmovdqa64 %zmm13, 192(%rcx)
+; AVX512F-NEXT:    vmovdqa64 %zmm8, (%rcx)
+; AVX512F-NEXT:    vmovdqa64 %zmm21, 64(%rcx)
 ; AVX512F-NEXT:    vmovdqa64 %zmm4, 128(%rcx)
-; AVX512F-NEXT:    vmovdqa64 %zmm21, 192(%r8)
-; AVX512F-NEXT:    vmovdqa64 %zmm20, (%r8)
-; AVX512F-NEXT:    vmovdqa64 %zmm22, 64(%r8)
+; AVX512F-NEXT:    vmovdqa64 %zmm31, 192(%r8)
+; AVX512F-NEXT:    vmovdqa64 %zmm19, (%r8)
+; AVX512F-NEXT:    vmovdqa64 %zmm25, 64(%r8)
 ; AVX512F-NEXT:    vmovdqa64 %zmm5, 128(%r8)
-; AVX512F-NEXT:    vmovdqa64 %zmm25, 128(%r9)
+; AVX512F-NEXT:    vmovdqa64 %zmm16, 128(%r9)
 ; AVX512F-NEXT:    vmovdqa64 %zmm2, 192(%r9)
-; AVX512F-NEXT:    vmovdqa64 %zmm13, (%r9)
-; AVX512F-NEXT:    vmovdqa64 %zmm14, 64(%r9)
-; AVX512F-NEXT:    addq $584, %rsp # imm = 0x248
+; AVX512F-NEXT:    vmovdqa64 %zmm18, (%r9)
+; AVX512F-NEXT:    vmovdqa64 %zmm17, 64(%r9)
+; AVX512F-NEXT:    addq $648, %rsp # imm = 0x288
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512BW-LABEL: load_i64_stride5_vf32:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    subq $584, %rsp # imm = 0x248
+; AVX512BW-NEXT:    subq $648, %rsp # imm = 0x288
 ; AVX512BW-NEXT:    vmovdqa64 1088(%rdi), %zmm21
 ; AVX512BW-NEXT:    vmovdqa64 1152(%rdi), %zmm1
-; AVX512BW-NEXT:    vmovdqa64 768(%rdi), %zmm20
+; AVX512BW-NEXT:    vmovdqa64 768(%rdi), %zmm18
 ; AVX512BW-NEXT:    vmovdqa64 832(%rdi), %zmm0
-; AVX512BW-NEXT:    vmovdqa64 384(%rdi), %zmm14
-; AVX512BW-NEXT:    vmovdqa64 448(%rdi), %zmm25
+; AVX512BW-NEXT:    vmovdqa64 384(%rdi), %zmm17
+; AVX512BW-NEXT:    vmovdqa64 320(%rdi), %zmm19
+; AVX512BW-NEXT:    vmovdqa64 448(%rdi), %zmm26
 ; AVX512BW-NEXT:    vmovdqa64 512(%rdi), %zmm2
-; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm22
-; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %zmm13
-; AVX512BW-NEXT:    vmovdqa64 128(%rdi), %zmm27
+; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm16
+; AVX512BW-NEXT:    vmovdqa64 128(%rdi), %zmm25
 ; AVX512BW-NEXT:    vmovdqa64 192(%rdi), %zmm3
-; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm26 = [12,1,6,0,12,1,6,0]
-; AVX512BW-NEXT:    # zmm26 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm11 = [12,1,6,0,12,1,6,0]
+; AVX512BW-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm10
-; AVX512BW-NEXT:    vpermt2q %zmm27, %zmm26, %zmm10
+; AVX512BW-NEXT:    vpermt2q %zmm25, %zmm11, %zmm10
 ; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} ymm24 = [0,5,10,15]
-; AVX512BW-NEXT:    vmovdqa64 %zmm22, %zmm11
-; AVX512BW-NEXT:    vpermt2q %zmm13, %zmm24, %zmm11
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm12
 ; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm4
-; AVX512BW-NEXT:    vpermt2q %zmm25, %zmm26, %zmm4
-; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm26, %zmm11, %zmm12
+; AVX512BW-NEXT:    vmovdqa64 %zmm19, %zmm2
+; AVX512BW-NEXT:    vpermt2q %zmm17, %zmm24, %zmm2
+; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm5
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm4
-; AVX512BW-NEXT:    vpermt2q %zmm20, %zmm26, %zmm5
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm2
+; AVX512BW-NEXT:    vpermt2q %zmm18, %zmm11, %zmm5
 ; AVX512BW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermi2q %zmm21, %zmm1, %zmm26
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm0
+; AVX512BW-NEXT:    vpermt2q %zmm21, %zmm11, %zmm0
+; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} ymm28 = <1,6,11,u>
-; AVX512BW-NEXT:    vmovdqa64 %zmm22, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm13, %zmm28, %zmm0
+; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm15 = [5,10,15,0,5,10,15,0]
+; AVX512BW-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vmovdqa64 %zmm25, %zmm30
+; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm15, %zmm30
+; AVX512BW-NEXT:    vmovdqa64 %zmm18, %zmm27
+; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm15, %zmm27
+; AVX512BW-NEXT:    vmovdqa64 %zmm26, %zmm22
+; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm15, %zmm22
+; AVX512BW-NEXT:    vmovdqa64 %zmm19, %zmm0
+; AVX512BW-NEXT:    vpermt2q %zmm17, %zmm28, %zmm0
 ; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm12 = [5,10,15,0,5,10,15,0]
-; AVX512BW-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm27, %zmm30
-; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm12, %zmm30
-; AVX512BW-NEXT:    vmovdqa64 %zmm20, %zmm19
-; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm12, %zmm19
-; AVX512BW-NEXT:    vmovdqa64 %zmm25, %zmm18
-; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm12, %zmm18
-; AVX512BW-NEXT:    vpermi2q %zmm1, %zmm21, %zmm12
+; AVX512BW-NEXT:    vpermi2q %zmm1, %zmm21, %zmm15
 ; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm23 = [6,11,0,1,6,11,0,1]
 ; AVX512BW-NEXT:    # zmm23 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm27, %zmm0
+; AVX512BW-NEXT:    vmovdqa64 %zmm25, %zmm0
 ; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm23, %zmm0
 ; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm15 = <2,7,12,u>
-; AVX512BW-NEXT:    vmovdqa64 %zmm22, %zmm16
-; AVX512BW-NEXT:    vpermt2q %zmm13, %zmm15, %zmm16
-; AVX512BW-NEXT:    vmovdqa64 %zmm20, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm23, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm25, %zmm0
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm13 = <2,7,12,u>
+; AVX512BW-NEXT:    vmovdqa64 %zmm18, %zmm0
 ; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm23, %zmm0
 ; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm26, %zmm0
+; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm23, %zmm0
+; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vpermi2q %zmm1, %zmm21, %zmm23
 ; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm29 = [7,12,0,2,7,12,0,2]
 ; AVX512BW-NEXT:    # zmm29 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm27, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm29, %zmm0
+; AVX512BW-NEXT:    vmovdqa64 %zmm26, %zmm0
+; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm29, %zmm0
 ; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [0,5,0,11,0,5,0,11]
 ; AVX512BW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vpermt2q %zmm27, %zmm0, %zmm3
-; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm25, %zmm27
-; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm29, %zmm27
-; AVX512BW-NEXT:    vpermt2q %zmm25, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, (%rsp) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm20, %zmm17
-; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm29, %zmm17
+; AVX512BW-NEXT:    vpermt2q %zmm26, %zmm0, %zmm4
+; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm25, %zmm26
+; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm29, %zmm26
+; AVX512BW-NEXT:    vpermt2q %zmm25, %zmm0, %zmm3
+; AVX512BW-NEXT:    vmovdqu64 %zmm3, (%rsp) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm18, %zmm20
+; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm29, %zmm20
 ; AVX512BW-NEXT:    vpermi2q %zmm1, %zmm21, %zmm29
 ; AVX512BW-NEXT:    vpermt2q %zmm21, %zmm0, %zmm1
 ; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} ymm21 = <11,0,5,u>
-; AVX512BW-NEXT:    vpermt2q %zmm20, %zmm0, %zmm4
-; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm20
-; AVX512BW-NEXT:    vpermt2q %zmm22, %zmm21, %zmm20
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm9 = <12,1,6,u>
-; AVX512BW-NEXT:    vpermt2q %zmm22, %zmm9, %zmm13
-; AVX512BW-NEXT:    vmovdqa64 320(%rdi), %zmm0
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm7
-; AVX512BW-NEXT:    vpermt2q %zmm14, %zmm24, %zmm7
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm8
-; AVX512BW-NEXT:    vpermt2q %zmm14, %zmm28, %zmm8
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm31
-; AVX512BW-NEXT:    vpermt2q %zmm14, %zmm15, %zmm31
-; AVX512BW-NEXT:    vmovdqa64 %zmm14, %zmm22
-; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm21, %zmm22
-; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm9, %zmm14
-; AVX512BW-NEXT:    vmovdqa64 704(%rdi), %zmm25
+; AVX512BW-NEXT:    vmovdqa64 %zmm19, %zmm21
+; AVX512BW-NEXT:    vpermt2q %zmm17, %zmm13, %zmm21
+; AVX512BW-NEXT:    vpermt2q %zmm18, %zmm0, %zmm2
+; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} ymm31 = <11,0,5,u>
+; AVX512BW-NEXT:    vmovdqa64 %zmm17, %zmm25
+; AVX512BW-NEXT:    vpermt2q %zmm19, %zmm31, %zmm25
+; AVX512BW-NEXT:    vpermt2q %zmm19, %zmm11, %zmm17
+; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %zmm18
+; AVX512BW-NEXT:    vmovdqa64 %zmm16, %zmm6
+; AVX512BW-NEXT:    vpermt2q %zmm18, %zmm24, %zmm6
+; AVX512BW-NEXT:    vmovdqa64 %zmm16, %zmm9
+; AVX512BW-NEXT:    vpermt2q %zmm18, %zmm28, %zmm9
+; AVX512BW-NEXT:    vmovdqa64 %zmm16, %zmm8
+; AVX512BW-NEXT:    vpermt2q %zmm18, %zmm13, %zmm8
+; AVX512BW-NEXT:    vmovdqa64 %zmm18, %zmm19
+; AVX512BW-NEXT:    vpermt2q %zmm16, %zmm31, %zmm19
+; AVX512BW-NEXT:    vpermt2q %zmm16, %zmm11, %zmm18
+; AVX512BW-NEXT:    vmovdqa64 704(%rdi), %zmm16
 ; AVX512BW-NEXT:    vmovdqa64 640(%rdi), %zmm1
 ; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm3
-; AVX512BW-NEXT:    vpermt2q %zmm25, %zmm24, %zmm3
+; AVX512BW-NEXT:    vpermt2q %zmm16, %zmm24, %zmm3
 ; AVX512BW-NEXT:    vmovdqa64 1024(%rdi), %zmm2
 ; AVX512BW-NEXT:    vmovdqa64 960(%rdi), %zmm0
 ; AVX512BW-NEXT:    vpermi2q %zmm2, %zmm0, %zmm24
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm6
-; AVX512BW-NEXT:    vpermt2q %zmm25, %zmm28, %zmm6
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm7
+; AVX512BW-NEXT:    vpermt2q %zmm16, %zmm28, %zmm7
 ; AVX512BW-NEXT:    vpermi2q %zmm2, %zmm0, %zmm28
 ; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm4
-; AVX512BW-NEXT:    vpermt2q %zmm25, %zmm15, %zmm4
-; AVX512BW-NEXT:    vpermi2q %zmm2, %zmm0, %zmm15
-; AVX512BW-NEXT:    vmovdqa64 %zmm25, %zmm5
-; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm21, %zmm5
-; AVX512BW-NEXT:    vpermi2q %zmm0, %zmm2, %zmm21
-; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm9, %zmm2
-; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm9, %zmm25
-; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm11 = zmm11[0,1,2,3],zmm10[4,5,6,7]
-; AVX512BW-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm10 # 64-byte Folded Reload
-; AVX512BW-NEXT:    # zmm10 = zmm7[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 64-byte Folded Reload
-; AVX512BW-NEXT:    # zmm3 = zmm3[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm7 = zmm24[0,1,2,3],zmm26[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqa64 256(%rdi), %zmm9
+; AVX512BW-NEXT:    vpermt2q %zmm16, %zmm13, %zmm4
+; AVX512BW-NEXT:    vpermi2q %zmm2, %zmm0, %zmm13
+; AVX512BW-NEXT:    vmovdqa64 %zmm16, %zmm5
+; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm31, %zmm5
+; AVX512BW-NEXT:    vpermi2q %zmm0, %zmm2, %zmm31
+; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm11, %zmm2
+; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm11, %zmm16
+; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm14 = zmm6[0,1,2,3],zmm10[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm12 = zmm0[0,1,2,3],zmm12[4,5,6,7]
+; AVX512BW-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm10 # 64-byte Folded Reload
+; AVX512BW-NEXT:    # zmm10 = zmm3[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm24, %zmm6 # 64-byte Folded Reload
+; AVX512BW-NEXT:    # zmm6 = zmm24[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqa64 256(%rdi), %zmm11
 ; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm24 = [0,1,2,3,4,5,6,11]
-; AVX512BW-NEXT:    vpermt2q %zmm9, %zmm24, %zmm11
-; AVX512BW-NEXT:    vmovdqa64 576(%rdi), %zmm26
-; AVX512BW-NEXT:    vpermt2q %zmm26, %zmm24, %zmm10
-; AVX512BW-NEXT:    vmovdqa64 896(%rdi), %zmm1
-; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm24, %zmm3
-; AVX512BW-NEXT:    vmovdqa64 1216(%rdi), %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm24, %zmm7
+; AVX512BW-NEXT:    vpermt2q %zmm11, %zmm24, %zmm14
+; AVX512BW-NEXT:    vmovdqa64 576(%rdi), %zmm0
+; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm24, %zmm12
+; AVX512BW-NEXT:    vmovdqa64 896(%rdi), %zmm3
+; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm24, %zmm10
+; AVX512BW-NEXT:    vmovdqa64 1216(%rdi), %zmm1
+; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm24, %zmm6
 ; AVX512BW-NEXT:    movb $7, %al
 ; AVX512BW-NEXT:    kmovd %eax, %k1
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm24, %zmm30 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm19 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm18 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 %zmm28, %zmm12 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,4,5,6,12]
-; AVX512BW-NEXT:    vpermt2q %zmm9, %zmm6, %zmm30
-; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm6, %zmm19
-; AVX512BW-NEXT:    vpermt2q %zmm26, %zmm6, %zmm18
-; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm6, %zmm12
+; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm30 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm27 {%k1}
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm22 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 %zmm28, %zmm15 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,6,12]
+; AVX512BW-NEXT:    vpermt2q %zmm11, %zmm7, %zmm30
+; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm7, %zmm27
+; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm7, %zmm22
+; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm7, %zmm15
 ; AVX512BW-NEXT:    movb $56, %al
 ; AVX512BW-NEXT:    kmovd %eax, %k1
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm16 {%k1}
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm4 {%k1}
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm31 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 %zmm23, %zmm15 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,4,5,8,13]
-; AVX512BW-NEXT:    vpermt2q %zmm9, %zmm6, %zmm16
-; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm6, %zmm4
-; AVX512BW-NEXT:    vpermt2q %zmm26, %zmm6, %zmm31
-; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm6, %zmm15
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm20 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 %zmm17, %zmm5 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 %zmm27, %zmm22 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 %zmm29, %zmm21 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,4,5,9,14]
-; AVX512BW-NEXT:    vpermt2q %zmm9, %zmm6, %zmm20
-; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm6, %zmm5
-; AVX512BW-NEXT:    vpermt2q %zmm26, %zmm6, %zmm22
-; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm6, %zmm21
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm13 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,4,5,10,15]
-; AVX512BW-NEXT:    vpermt2q %zmm9, %zmm6, %zmm13
-; AVX512BW-NEXT:    vmovdqu64 (%rsp), %zmm8 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm14 {%k1}
-; AVX512BW-NEXT:    vpermt2q %zmm26, %zmm6, %zmm14
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm2 {%k1}
-; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm6, %zmm2
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm8 {%k1}
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm4 {%k1}
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm21 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 %zmm23, %zmm13 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,8,13]
+; AVX512BW-NEXT:    vpermt2q %zmm11, %zmm7, %zmm8
+; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm7, %zmm4
+; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm7, %zmm21
+; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm7, %zmm13
+; AVX512BW-NEXT:    vmovdqa64 %zmm26, %zmm19 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 %zmm20, %zmm5 {%k1}
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm25 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 %zmm29, %zmm31 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,9,14]
+; AVX512BW-NEXT:    vpermt2q %zmm11, %zmm7, %zmm19
+; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm7, %zmm5
+; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm7, %zmm25
+; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm7, %zmm31
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm17 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,10,15]
+; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm7, %zmm17
+; AVX512BW-NEXT:    vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm18 {%k1}
+; AVX512BW-NEXT:    vpermt2q %zmm11, %zmm7, %zmm18
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k1}
+; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm7, %zmm2
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm25 {%k1}
-; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm6, %zmm25
-; AVX512BW-NEXT:    vmovdqa64 %zmm7, 192(%rsi)
-; AVX512BW-NEXT:    vmovdqa64 %zmm3, 128(%rsi)
-; AVX512BW-NEXT:    vmovdqa64 %zmm10, 64(%rsi)
-; AVX512BW-NEXT:    vmovdqa64 %zmm11, (%rsi)
-; AVX512BW-NEXT:    vmovdqa64 %zmm12, 192(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm16 {%k1}
+; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm7, %zmm16
+; AVX512BW-NEXT:    vmovdqa64 %zmm6, 192(%rsi)
+; AVX512BW-NEXT:    vmovdqa64 %zmm10, 128(%rsi)
+; AVX512BW-NEXT:    vmovdqa64 %zmm12, 64(%rsi)
+; AVX512BW-NEXT:    vmovdqa64 %zmm14, (%rsi)
+; AVX512BW-NEXT:    vmovdqa64 %zmm15, 192(%rdx)
 ; AVX512BW-NEXT:    vmovdqa64 %zmm30, (%rdx)
-; AVX512BW-NEXT:    vmovdqa64 %zmm18, 64(%rdx)
-; AVX512BW-NEXT:    vmovdqa64 %zmm19, 128(%rdx)
-; AVX512BW-NEXT:    vmovdqa64 %zmm15, 192(%rcx)
-; AVX512BW-NEXT:    vmovdqa64 %zmm16, (%rcx)
-; AVX512BW-NEXT:    vmovdqa64 %zmm31, 64(%rcx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm22, 64(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm27, 128(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm13, 192(%rcx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm8, (%rcx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm21, 64(%rcx)
 ; AVX512BW-NEXT:    vmovdqa64 %zmm4, 128(%rcx)
-; AVX512BW-NEXT:    vmovdqa64 %zmm21, 192(%r8)
-; AVX512BW-NEXT:    vmovdqa64 %zmm20, (%r8)
-; AVX512BW-NEXT:    vmovdqa64 %zmm22, 64(%r8)
+; AVX512BW-NEXT:    vmovdqa64 %zmm31, 192(%r8)
+; AVX512BW-NEXT:    vmovdqa64 %zmm19, (%r8)
+; AVX512BW-NEXT:    vmovdqa64 %zmm25, 64(%r8)
 ; AVX512BW-NEXT:    vmovdqa64 %zmm5, 128(%r8)
-; AVX512BW-NEXT:    vmovdqa64 %zmm25, 128(%r9)
+; AVX512BW-NEXT:    vmovdqa64 %zmm16, 128(%r9)
 ; AVX512BW-NEXT:    vmovdqa64 %zmm2, 192(%r9)
-; AVX512BW-NEXT:    vmovdqa64 %zmm13, (%r9)
-; AVX512BW-NEXT:    vmovdqa64 %zmm14, 64(%r9)
-; AVX512BW-NEXT:    addq $584, %rsp # imm = 0x248
+; AVX512BW-NEXT:    vmovdqa64 %zmm18, (%r9)
+; AVX512BW-NEXT:    vmovdqa64 %zmm17, 64(%r9)
+; AVX512BW-NEXT:    addq $648, %rsp # imm = 0x288
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
   %wide.vec = load <160 x i64>, ptr %in.vec, align 64
@@ -5486,292 +5490,291 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ;
 ; AVX512F-LABEL: load_i64_stride5_vf64:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    subq $3400, %rsp # imm = 0xD48
-; AVX512F-NEXT:    vmovdqa64 1728(%rdi), %zmm21
-; AVX512F-NEXT:    vmovdqa64 1792(%rdi), %zmm4
-; AVX512F-NEXT:    vmovdqa64 1408(%rdi), %zmm19
-; AVX512F-NEXT:    vmovdqa64 1088(%rdi), %zmm0
-; AVX512F-NEXT:    vmovdqa64 1152(%rdi), %zmm3
-; AVX512F-NEXT:    vmovdqa64 768(%rdi), %zmm26
-; AVX512F-NEXT:    vmovdqa64 832(%rdi), %zmm5
-; AVX512F-NEXT:    vmovdqa64 448(%rdi), %zmm1
-; AVX512F-NEXT:    vmovdqa64 512(%rdi), %zmm6
+; AVX512F-NEXT:    subq $3336, %rsp # imm = 0xD08
+; AVX512F-NEXT:    vmovdqa64 1728(%rdi), %zmm3
+; AVX512F-NEXT:    vmovdqa64 1792(%rdi), %zmm7
+; AVX512F-NEXT:    vmovdqa64 1408(%rdi), %zmm0
+; AVX512F-NEXT:    vmovdqa64 1088(%rdi), %zmm1
+; AVX512F-NEXT:    vmovdqa64 1152(%rdi), %zmm6
+; AVX512F-NEXT:    vmovdqa64 768(%rdi), %zmm5
+; AVX512F-NEXT:    vmovdqa64 832(%rdi), %zmm8
+; AVX512F-NEXT:    vmovdqa64 448(%rdi), %zmm4
+; AVX512F-NEXT:    vmovdqa64 512(%rdi), %zmm9
 ; AVX512F-NEXT:    vmovdqa64 128(%rdi), %zmm2
-; AVX512F-NEXT:    vmovdqa64 192(%rdi), %zmm7
-; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm15 = [12,1,6,0,12,1,6,0]
-; AVX512F-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm7, %zmm16
-; AVX512F-NEXT:    vpermt2q %zmm2, %zmm15, %zmm16
-; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm8
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm15, %zmm8
-; AVX512F-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm8
-; AVX512F-NEXT:    vpermt2q %zmm26, %zmm15, %zmm8
-; AVX512F-NEXT:    vmovdqu64 %zmm8, (%rsp) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm9
-; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm8
-; AVX512F-NEXT:    vpermt2q %zmm0, %zmm15, %zmm9
+; AVX512F-NEXT:    vmovdqa64 192(%rdi), %zmm10
+; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm16 = [12,1,6,0,12,1,6,0]
+; AVX512F-NEXT:    # zmm16 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vmovdqa64 %zmm10, %zmm11
+; AVX512F-NEXT:    vpermt2q %zmm2, %zmm16, %zmm11
+; AVX512F-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm11
+; AVX512F-NEXT:    vpermt2q %zmm4, %zmm16, %zmm11
+; AVX512F-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm11
+; AVX512F-NEXT:    vpermt2q %zmm5, %zmm16, %zmm11
+; AVX512F-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm12
+; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm11
+; AVX512F-NEXT:    vpermt2q %zmm1, %zmm16, %zmm12
+; AVX512F-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm7, %zmm6
+; AVX512F-NEXT:    vpermt2q %zmm3, %zmm16, %zmm6
+; AVX512F-NEXT:    vmovdqu64 %zmm6, (%rsp) # 64-byte Spill
+; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm12 = [5,10,15,0,5,10,15,0]
+; AVX512F-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm6
+; AVX512F-NEXT:    vpermt2q %zmm10, %zmm12, %zmm6
+; AVX512F-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm6
+; AVX512F-NEXT:    vpermt2q %zmm9, %zmm12, %zmm6
+; AVX512F-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm6
+; AVX512F-NEXT:    vpermt2q %zmm11, %zmm12, %zmm6
+; AVX512F-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm6
+; AVX512F-NEXT:    vpermt2q %zmm8, %zmm12, %zmm6
+; AVX512F-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm6
+; AVX512F-NEXT:    vpermt2q %zmm7, %zmm12, %zmm6
+; AVX512F-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm6 = [6,11,0,1,6,11,0,1]
+; AVX512F-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm13
+; AVX512F-NEXT:    vpermt2q %zmm10, %zmm6, %zmm13
+; AVX512F-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm13
+; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm6
+; AVX512F-NEXT:    vpermt2q %zmm9, %zmm13, %zmm6
+; AVX512F-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm6
+; AVX512F-NEXT:    vpermt2q %zmm11, %zmm13, %zmm6
+; AVX512F-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm6
+; AVX512F-NEXT:    vpermt2q %zmm8, %zmm13, %zmm6
+; AVX512F-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm6
+; AVX512F-NEXT:    vpermt2q %zmm7, %zmm13, %zmm6
+; AVX512F-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm14 = [7,12,0,2,7,12,0,2]
+; AVX512F-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm6
+; AVX512F-NEXT:    vpermt2q %zmm9, %zmm14, %zmm6
+; AVX512F-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm6 = [0,5,0,11,0,5,0,11]
+; AVX512F-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vpermt2q %zmm4, %zmm6, %zmm9
 ; AVX512F-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm3
-; AVX512F-NEXT:    vpermt2q %zmm21, %zmm15, %zmm3
-; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm9 = [5,10,15,0,5,10,15,0]
-; AVX512F-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm3
-; AVX512F-NEXT:    vpermt2q %zmm7, %zmm9, %zmm3
-; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm3
-; AVX512F-NEXT:    vpermt2q %zmm6, %zmm9, %zmm3
-; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm3
-; AVX512F-NEXT:    vpermt2q %zmm8, %zmm9, %zmm3
-; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm26, %zmm3
-; AVX512F-NEXT:    vpermt2q %zmm5, %zmm9, %zmm3
-; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm21, %zmm3
-; AVX512F-NEXT:    vpermt2q %zmm4, %zmm9, %zmm3
-; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [6,11,0,1,6,11,0,1]
-; AVX512F-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm3
-; AVX512F-NEXT:    vpermt2q %zmm7, %zmm10, %zmm3
-; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm3
-; AVX512F-NEXT:    vpermt2q %zmm6, %zmm10, %zmm3
-; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm3
-; AVX512F-NEXT:    vpermt2q %zmm8, %zmm10, %zmm3
-; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm26, %zmm3
-; AVX512F-NEXT:    vpermt2q %zmm5, %zmm10, %zmm3
-; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm21, %zmm3
-; AVX512F-NEXT:    vpermt2q %zmm4, %zmm10, %zmm3
-; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm11 = [7,12,0,2,7,12,0,2]
-; AVX512F-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm3
-; AVX512F-NEXT:    vpermt2q %zmm7, %zmm11, %zmm3
-; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,0,11,0,5,0,11]
-; AVX512F-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vpermt2q %zmm2, %zmm3, %zmm7
-; AVX512F-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm4
+; AVX512F-NEXT:    vpermt2q %zmm10, %zmm14, %zmm4
+; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm2, %zmm6, %zmm10
+; AVX512F-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm2
-; AVX512F-NEXT:    vpermt2q %zmm6, %zmm11, %zmm2
+; AVX512F-NEXT:    vpermt2q %zmm11, %zmm14, %zmm2
 ; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm3, %zmm6
-; AVX512F-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm1
-; AVX512F-NEXT:    vpermt2q %zmm8, %zmm11, %zmm1
+; AVX512F-NEXT:    vpermt2q %zmm1, %zmm6, %zmm11
+; AVX512F-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm1
+; AVX512F-NEXT:    vpermt2q %zmm8, %zmm14, %zmm1
 ; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm0, %zmm3, %zmm8
+; AVX512F-NEXT:    vpermt2q %zmm5, %zmm6, %zmm8
 ; AVX512F-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm26, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm5, %zmm11, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm26, %zmm3, %zmm5
-; AVX512F-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm21, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm4, %zmm11, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm21, %zmm3, %zmm4
-; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 1472(%rdi), %zmm1
-; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm17
-; AVX512F-NEXT:    vpermt2q %zmm19, %zmm15, %zmm17
-; AVX512F-NEXT:    vmovdqa64 %zmm19, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm9, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm19, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm10, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm19, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm11, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm19, %zmm3, %zmm1
+; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm1
+; AVX512F-NEXT:    vpermt2q %zmm7, %zmm14, %zmm1
 ; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 2048(%rdi), %zmm0
-; AVX512F-NEXT:    vmovdqa64 2112(%rdi), %zmm4
-; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm18
-; AVX512F-NEXT:    vpermt2q %zmm0, %zmm15, %zmm18
-; AVX512F-NEXT:    vmovdqa64 2368(%rdi), %zmm1
-; AVX512F-NEXT:    vmovdqa64 2432(%rdi), %zmm5
-; AVX512F-NEXT:    vpermi2q %zmm1, %zmm5, %zmm15
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm2
-; AVX512F-NEXT:    vpermt2q %zmm4, %zmm9, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermi2q %zmm5, %zmm1, %zmm9
-; AVX512F-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm2
-; AVX512F-NEXT:    vpermt2q %zmm4, %zmm10, %zmm2
+; AVX512F-NEXT:    vpermt2q %zmm3, %zmm6, %zmm7
+; AVX512F-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 1472(%rdi), %zmm2
+; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm19
+; AVX512F-NEXT:    vpermt2q %zmm0, %zmm16, %zmm19
+; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm1
+; AVX512F-NEXT:    vpermt2q %zmm2, %zmm12, %zmm1
+; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm1
+; AVX512F-NEXT:    vpermt2q %zmm2, %zmm13, %zmm1
+; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm1
+; AVX512F-NEXT:    vpermt2q %zmm2, %zmm14, %zmm1
+; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm0, %zmm6, %zmm2
 ; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermi2q %zmm5, %zmm1, %zmm10
-; AVX512F-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm2
-; AVX512F-NEXT:    vpermt2q %zmm4, %zmm11, %zmm2
+; AVX512F-NEXT:    vmovdqa64 2048(%rdi), %zmm0
+; AVX512F-NEXT:    vmovdqa64 2112(%rdi), %zmm2
+; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm18
+; AVX512F-NEXT:    vpermt2q %zmm0, %zmm16, %zmm18
+; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm1
+; AVX512F-NEXT:    vpermt2q %zmm2, %zmm12, %zmm1
+; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm1
+; AVX512F-NEXT:    vpermt2q %zmm2, %zmm13, %zmm1
+; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm1
+; AVX512F-NEXT:    vpermt2q %zmm2, %zmm14, %zmm1
+; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm0, %zmm6, %zmm2
 ; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermi2q %zmm5, %zmm1, %zmm11
-; AVX512F-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm3, %zmm5
-; AVX512F-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm0, %zmm3, %zmm4
-; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 (%rdi), %zmm2
-; AVX512F-NEXT:    vmovdqa64 64(%rdi), %zmm31
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,5,10,15]
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm19
-; AVX512F-NEXT:    vpermt2q %zmm31, %zmm1, %zmm19
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = <1,6,11,u>
+; AVX512F-NEXT:    vmovdqa64 2368(%rdi), %zmm0
+; AVX512F-NEXT:    vmovdqa64 2432(%rdi), %zmm1
+; AVX512F-NEXT:    vpermi2q %zmm1, %zmm0, %zmm12
+; AVX512F-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermi2q %zmm1, %zmm0, %zmm13
+; AVX512F-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermi2q %zmm1, %zmm0, %zmm14
+; AVX512F-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm17
+; AVX512F-NEXT:    vpermt2q %zmm0, %zmm6, %zmm1
+; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm0, %zmm16, %zmm17
+; AVX512F-NEXT:    vmovdqa64 384(%rdi), %zmm31
+; AVX512F-NEXT:    vmovdqa64 320(%rdi), %zmm2
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,5,10,15]
+; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm1
+; AVX512F-NEXT:    vpermt2q %zmm31, %zmm3, %zmm1
+; AVX512F-NEXT:    vmovdqa64 {{.*#+}} ymm29 = <1,6,11,u>
 ; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm31, %zmm4, %zmm0
+; AVX512F-NEXT:    vpermt2q %zmm31, %zmm29, %zmm0
 ; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm8 = <2,7,12,u>
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm6 = <2,7,12,u>
 ; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm31, %zmm8, %zmm0
+; AVX512F-NEXT:    vpermt2q %zmm31, %zmm6, %zmm0
 ; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm10 = <11,0,5,u>
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm7 = <11,0,5,u>
 ; AVX512F-NEXT:    vmovdqa64 %zmm31, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm2, %zmm10, %zmm0
+; AVX512F-NEXT:    vpermt2q %zmm2, %zmm7, %zmm0
 ; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 {{.*#+}} ymm25 = <12,1,6,u>
-; AVX512F-NEXT:    vpermt2q %zmm2, %zmm25, %zmm31
-; AVX512F-NEXT:    vmovdqa64 384(%rdi), %zmm26
-; AVX512F-NEXT:    vmovdqa64 320(%rdi), %zmm2
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm30
-; AVX512F-NEXT:    vpermt2q %zmm26, %zmm1, %zmm30
+; AVX512F-NEXT:    vpermt2q %zmm2, %zmm16, %zmm31
+; AVX512F-NEXT:    vmovdqa64 (%rdi), %zmm2
+; AVX512F-NEXT:    vmovdqa64 64(%rdi), %zmm23
+; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm4
+; AVX512F-NEXT:    vpermt2q %zmm23, %zmm3, %zmm4
 ; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm26, %zmm4, %zmm0
+; AVX512F-NEXT:    vpermt2q %zmm23, %zmm29, %zmm0
 ; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm26, %zmm8, %zmm0
+; AVX512F-NEXT:    vpermt2q %zmm23, %zmm6, %zmm0
 ; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm26, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm2, %zmm10, %zmm0
+; AVX512F-NEXT:    vmovdqa64 %zmm23, %zmm0
+; AVX512F-NEXT:    vpermt2q %zmm2, %zmm7, %zmm0
 ; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm2, %zmm25, %zmm26
+; AVX512F-NEXT:    vpermt2q %zmm2, %zmm16, %zmm23
 ; AVX512F-NEXT:    vmovdqa64 1024(%rdi), %zmm21
 ; AVX512F-NEXT:    vmovdqa64 960(%rdi), %zmm2
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm3
-; AVX512F-NEXT:    vpermt2q %zmm21, %zmm1, %zmm3
+; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm10
+; AVX512F-NEXT:    vpermt2q %zmm21, %zmm3, %zmm10
 ; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm21, %zmm4, %zmm0
+; AVX512F-NEXT:    vpermt2q %zmm21, %zmm29, %zmm0
 ; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm21, %zmm8, %zmm0
+; AVX512F-NEXT:    vpermt2q %zmm21, %zmm6, %zmm0
 ; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-NEXT:    vmovdqa64 %zmm21, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm2, %zmm10, %zmm0
+; AVX512F-NEXT:    vpermt2q %zmm2, %zmm7, %zmm0
 ; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm2, %zmm25, %zmm21
+; AVX512F-NEXT:    vpermt2q %zmm2, %zmm16, %zmm21
 ; AVX512F-NEXT:    vmovdqa64 704(%rdi), %zmm20
 ; AVX512F-NEXT:    vmovdqa64 640(%rdi), %zmm2
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm5
-; AVX512F-NEXT:    vpermt2q %zmm20, %zmm1, %zmm5
+; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm11
+; AVX512F-NEXT:    vpermt2q %zmm20, %zmm3, %zmm11
 ; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm20, %zmm4, %zmm0
+; AVX512F-NEXT:    vpermt2q %zmm20, %zmm29, %zmm0
 ; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm20, %zmm8, %zmm0
+; AVX512F-NEXT:    vpermt2q %zmm20, %zmm6, %zmm0
 ; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-NEXT:    vmovdqa64 %zmm20, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm2, %zmm10, %zmm0
+; AVX512F-NEXT:    vpermt2q %zmm2, %zmm7, %zmm0
 ; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm2, %zmm25, %zmm20
-; AVX512F-NEXT:    vmovdqa64 1664(%rdi), %zmm22
+; AVX512F-NEXT:    vpermt2q %zmm2, %zmm16, %zmm20
+; AVX512F-NEXT:    vmovdqa64 1664(%rdi), %zmm25
 ; AVX512F-NEXT:    vmovdqa64 1600(%rdi), %zmm2
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm11
-; AVX512F-NEXT:    vpermt2q %zmm22, %zmm1, %zmm11
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm22, %zmm4, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm13
+; AVX512F-NEXT:    vpermt2q %zmm25, %zmm3, %zmm13
 ; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm22, %zmm8, %zmm0
+; AVX512F-NEXT:    vpermt2q %zmm25, %zmm29, %zmm0
 ; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm22, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm2, %zmm10, %zmm0
+; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm30
+; AVX512F-NEXT:    vpermt2q %zmm25, %zmm6, %zmm30
+; AVX512F-NEXT:    vmovdqa64 %zmm25, %zmm0
+; AVX512F-NEXT:    vpermt2q %zmm2, %zmm7, %zmm0
 ; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm2, %zmm25, %zmm22
+; AVX512F-NEXT:    vpermt2q %zmm2, %zmm16, %zmm25
 ; AVX512F-NEXT:    vmovdqa64 1344(%rdi), %zmm12
 ; AVX512F-NEXT:    vmovdqa64 1280(%rdi), %zmm2
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm13
-; AVX512F-NEXT:    vpermt2q %zmm12, %zmm1, %zmm13
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm24
-; AVX512F-NEXT:    vpermt2q %zmm12, %zmm4, %zmm24
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm28
-; AVX512F-NEXT:    vpermt2q %zmm12, %zmm8, %zmm28
-; AVX512F-NEXT:    vmovdqa64 %zmm12, %zmm29
-; AVX512F-NEXT:    vpermt2q %zmm2, %zmm10, %zmm29
-; AVX512F-NEXT:    vpermt2q %zmm2, %zmm25, %zmm12
+; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm14
+; AVX512F-NEXT:    vpermt2q %zmm12, %zmm3, %zmm14
+; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm26
+; AVX512F-NEXT:    vpermt2q %zmm12, %zmm29, %zmm26
+; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm27
+; AVX512F-NEXT:    vpermt2q %zmm12, %zmm6, %zmm27
+; AVX512F-NEXT:    vmovdqa64 %zmm12, %zmm28
+; AVX512F-NEXT:    vpermt2q %zmm2, %zmm7, %zmm28
+; AVX512F-NEXT:    vpermt2q %zmm2, %zmm16, %zmm12
 ; AVX512F-NEXT:    vmovdqa64 1984(%rdi), %zmm9
-; AVX512F-NEXT:    vmovdqa64 1920(%rdi), %zmm14
-; AVX512F-NEXT:    vmovdqa64 %zmm14, %zmm2
-; AVX512F-NEXT:    vpermt2q %zmm9, %zmm1, %zmm2
-; AVX512F-NEXT:    vmovdqa64 2304(%rdi), %zmm6
+; AVX512F-NEXT:    vmovdqa64 1920(%rdi), %zmm15
+; AVX512F-NEXT:    vmovdqa64 %zmm15, %zmm2
+; AVX512F-NEXT:    vpermt2q %zmm9, %zmm3, %zmm2
+; AVX512F-NEXT:    vmovdqa64 2304(%rdi), %zmm5
 ; AVX512F-NEXT:    vmovdqa64 2240(%rdi), %zmm0
-; AVX512F-NEXT:    vpermi2q %zmm6, %zmm0, %zmm1
-; AVX512F-NEXT:    vmovdqa64 %zmm14, %zmm7
-; AVX512F-NEXT:    vpermt2q %zmm9, %zmm4, %zmm7
-; AVX512F-NEXT:    vpermi2q %zmm6, %zmm0, %zmm4
-; AVX512F-NEXT:    vmovdqa64 %zmm14, %zmm23
-; AVX512F-NEXT:    vpermt2q %zmm9, %zmm8, %zmm23
-; AVX512F-NEXT:    vpermi2q %zmm6, %zmm0, %zmm8
-; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm27
-; AVX512F-NEXT:    vpermt2q %zmm14, %zmm10, %zmm27
-; AVX512F-NEXT:    vpermi2q %zmm0, %zmm6, %zmm10
-; AVX512F-NEXT:    vpermt2q %zmm0, %zmm25, %zmm6
-; AVX512F-NEXT:    vpermt2q %zmm14, %zmm25, %zmm9
-; AVX512F-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm19[0,1,2,3],zmm16[4,5,6,7]
-; AVX512F-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm30, %zmm14 # 64-byte Folded Reload
-; AVX512F-NEXT:    # zmm14 = zmm30[0,1,2,3],mem[4,5,6,7]
-; AVX512F-NEXT:    vshufi64x2 $228, (%rsp), %zmm5, %zmm5 # 64-byte Folded Reload
-; AVX512F-NEXT:    # zmm5 = zmm5[0,1,2,3],mem[4,5,6,7]
-; AVX512F-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 64-byte Folded Reload
-; AVX512F-NEXT:    # zmm3 = zmm3[0,1,2,3],mem[4,5,6,7]
-; AVX512F-NEXT:    vshufi64x2 {{.*#+}} zmm13 = zmm13[0,1,2,3],zmm17[4,5,6,7]
-; AVX512F-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm19 # 64-byte Folded Reload
-; AVX512F-NEXT:    # zmm19 = zmm11[0,1,2,3],mem[4,5,6,7]
+; AVX512F-NEXT:    vpermi2q %zmm5, %zmm0, %zmm3
+; AVX512F-NEXT:    vmovdqa64 %zmm15, %zmm8
+; AVX512F-NEXT:    vpermt2q %zmm9, %zmm29, %zmm8
+; AVX512F-NEXT:    vpermi2q %zmm5, %zmm0, %zmm29
+; AVX512F-NEXT:    vmovdqa64 %zmm15, %zmm22
+; AVX512F-NEXT:    vpermt2q %zmm9, %zmm6, %zmm22
+; AVX512F-NEXT:    vpermi2q %zmm5, %zmm0, %zmm6
+; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm24
+; AVX512F-NEXT:    vpermt2q %zmm15, %zmm7, %zmm24
+; AVX512F-NEXT:    vpermi2q %zmm0, %zmm5, %zmm7
+; AVX512F-NEXT:    vpermt2q %zmm0, %zmm16, %zmm5
+; AVX512F-NEXT:    vpermt2q %zmm15, %zmm16, %zmm9
+; AVX512F-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm0 # 64-byte Folded Reload
+; AVX512F-NEXT:    # zmm0 = zmm4[0,1,2,3],mem[4,5,6,7]
+; AVX512F-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm4 # 64-byte Folded Reload
+; AVX512F-NEXT:    # zmm4 = zmm1[0,1,2,3],mem[4,5,6,7]
+; AVX512F-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm11 # 64-byte Folded Reload
+; AVX512F-NEXT:    # zmm11 = zmm11[0,1,2,3],mem[4,5,6,7]
+; AVX512F-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm10 # 64-byte Folded Reload
+; AVX512F-NEXT:    # zmm10 = zmm10[0,1,2,3],mem[4,5,6,7]
+; AVX512F-NEXT:    vshufi64x2 {{.*#+}} zmm19 = zmm14[0,1,2,3],zmm19[4,5,6,7]
+; AVX512F-NEXT:    vshufi64x2 $228, (%rsp), %zmm13, %zmm1 # 64-byte Folded Reload
+; AVX512F-NEXT:    # zmm1 = zmm13[0,1,2,3],mem[4,5,6,7]
 ; AVX512F-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm18[4,5,6,7]
-; AVX512F-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm15[4,5,6,7]
-; AVX512F-NEXT:    vmovdqa64 256(%rdi), %zmm16
-; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm25 = [0,1,2,3,4,5,6,11]
-; AVX512F-NEXT:    vpermt2q %zmm16, %zmm25, %zmm0
+; AVX512F-NEXT:    vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm17[4,5,6,7]
+; AVX512F-NEXT:    vmovdqa64 256(%rdi), %zmm18
+; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm15 = [0,1,2,3,4,5,6,11]
+; AVX512F-NEXT:    vpermt2q %zmm18, %zmm15, %zmm0
 ; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 576(%rdi), %zmm15
-; AVX512F-NEXT:    vpermt2q %zmm15, %zmm25, %zmm14
-; AVX512F-NEXT:    vmovdqu64 %zmm14, (%rsp) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 576(%rdi), %zmm14
+; AVX512F-NEXT:    vpermt2q %zmm14, %zmm15, %zmm4
+; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-NEXT:    vmovdqa64 896(%rdi), %zmm17
-; AVX512F-NEXT:    vpermt2q %zmm17, %zmm25, %zmm5
-; AVX512F-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 1216(%rdi), %zmm11
-; AVX512F-NEXT:    vpermt2q %zmm11, %zmm25, %zmm3
-; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 1536(%rdi), %zmm18
-; AVX512F-NEXT:    vpermt2q %zmm18, %zmm25, %zmm13
-; AVX512F-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 1856(%rdi), %zmm14
-; AVX512F-NEXT:    vpermt2q %zmm14, %zmm25, %zmm19
+; AVX512F-NEXT:    vpermt2q %zmm17, %zmm15, %zmm11
+; AVX512F-NEXT:    vmovdqu64 %zmm11, (%rsp) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 1216(%rdi), %zmm13
+; AVX512F-NEXT:    vpermt2q %zmm13, %zmm15, %zmm10
+; AVX512F-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 1536(%rdi), %zmm16
+; AVX512F-NEXT:    vpermt2q %zmm16, %zmm15, %zmm19
 ; AVX512F-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 1856(%rdi), %zmm11
+; AVX512F-NEXT:    vpermt2q %zmm11, %zmm15, %zmm1
+; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-NEXT:    vmovdqa64 2176(%rdi), %zmm19
-; AVX512F-NEXT:    vpermt2q %zmm19, %zmm25, %zmm2
+; AVX512F-NEXT:    vpermt2q %zmm19, %zmm15, %zmm2
 ; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 2496(%rdi), %zmm13
-; AVX512F-NEXT:    vpermt2q %zmm13, %zmm25, %zmm1
-; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 2496(%rdi), %zmm10
+; AVX512F-NEXT:    vpermt2q %zmm10, %zmm15, %zmm3
+; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-NEXT:    movb $7, %al
 ; AVX512F-NEXT:    kmovw %eax, %k1
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm5 {%k1}
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm7, %zmm3 {%k1}
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm25 {%k1}
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm15 {%k1}
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm4 {%k1}
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm7 {%k1}
+; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm8 {%k1}
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
 ; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1}
@@ -5779,467 +5782,467 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
 ; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm1 {%k1}
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm24, %zmm2 {%k1}
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm24 {%k1}
-; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,12]
-; AVX512F-NEXT:    vpermt2q %zmm16, %zmm4, %zmm5
-; AVX512F-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm19, %zmm4, %zmm3
-; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm15, %zmm4, %zmm25
-; AVX512F-NEXT:    vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm11, %zmm4, %zmm7
-; AVX512F-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm17, %zmm4, %zmm0
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm2 {%k1}
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm26, %zmm3 {%k1}
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm29, %zmm26 {%k1}
+; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm29 = [0,1,2,3,4,5,6,12]
+; AVX512F-NEXT:    vpermt2q %zmm18, %zmm29, %zmm15
+; AVX512F-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm19, %zmm29, %zmm4
+; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm14, %zmm29, %zmm8
+; AVX512F-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm13, %zmm29, %zmm0
 ; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm14, %zmm4, %zmm1
+; AVX512F-NEXT:    vpermt2q %zmm17, %zmm29, %zmm1
 ; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm18, %zmm4, %zmm2
+; AVX512F-NEXT:    vpermt2q %zmm11, %zmm29, %zmm2
 ; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm13, %zmm4, %zmm24
-; AVX512F-NEXT:    vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm16, %zmm29, %zmm3
+; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm10, %zmm29, %zmm26
+; AVX512F-NEXT:    vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-NEXT:    movb $56, %al
 ; AVX512F-NEXT:    kmovw %eax, %k1
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm23 {%k1}
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1}
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm30 {%k1}
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm25 {%k1}
+; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm0 {%k1}
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm24 {%k1}
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm22 {%k1}
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm8 {%k1}
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm26 {%k1}
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm15 {%k1}
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm30 {%k1}
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm28 {%k1}
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm27 {%k1}
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm8 {%k1}
-; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,2,3,4,5,8,13]
-; AVX512F-NEXT:    vpermt2q %zmm16, %zmm3, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm19, %zmm3, %zmm23
-; AVX512F-NEXT:    vpermt2q %zmm15, %zmm3, %zmm0
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm6 {%k1}
+; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,4,5,8,13]
+; AVX512F-NEXT:    vpermt2q %zmm18, %zmm1, %zmm0
 ; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm11, %zmm3, %zmm30
-; AVX512F-NEXT:    vpermt2q %zmm17, %zmm3, %zmm25
-; AVX512F-NEXT:    vpermt2q %zmm14, %zmm3, %zmm24
-; AVX512F-NEXT:    vpermt2q %zmm18, %zmm3, %zmm28
-; AVX512F-NEXT:    vpermt2q %zmm13, %zmm3, %zmm8
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm7 {%k1}
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm27 {%k1}
+; AVX512F-NEXT:    vpermt2q %zmm19, %zmm1, %zmm22
+; AVX512F-NEXT:    vpermt2q %zmm14, %zmm1, %zmm8
+; AVX512F-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm13, %zmm1, %zmm26
+; AVX512F-NEXT:    vpermt2q %zmm17, %zmm1, %zmm15
+; AVX512F-NEXT:    vpermt2q %zmm11, %zmm1, %zmm30
+; AVX512F-NEXT:    vpermt2q %zmm16, %zmm1, %zmm27
+; AVX512F-NEXT:    vpermt2q %zmm10, %zmm1, %zmm6
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm8 {%k1}
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1}
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm24 {%k1}
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm29 {%k1}
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm1 {%k1}
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm3 {%k1}
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm5 {%k1}
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm29 {%k1}
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm10 {%k1}
-; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,4,5,9,14]
-; AVX512F-NEXT:    vpermt2q %zmm16, %zmm1, %zmm7
-; AVX512F-NEXT:    vpermt2q %zmm19, %zmm1, %zmm27
-; AVX512F-NEXT:    vpermt2q %zmm15, %zmm1, %zmm0
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm2
-; AVX512F-NEXT:    vpermt2q %zmm11, %zmm1, %zmm3
+; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm4 {%k1}
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm28 {%k1}
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm7 {%k1}
+; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,5,9,14]
+; AVX512F-NEXT:    vpermt2q %zmm18, %zmm2, %zmm8
+; AVX512F-NEXT:    vpermt2q %zmm19, %zmm2, %zmm24
+; AVX512F-NEXT:    vpermt2q %zmm14, %zmm2, %zmm29
+; AVX512F-NEXT:    vpermt2q %zmm13, %zmm2, %zmm1
+; AVX512F-NEXT:    vpermt2q %zmm17, %zmm2, %zmm3
 ; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm17, %zmm1, %zmm4
-; AVX512F-NEXT:    vpermt2q %zmm14, %zmm1, %zmm5
-; AVX512F-NEXT:    vpermt2q %zmm18, %zmm1, %zmm29
-; AVX512F-NEXT:    vpermt2q %zmm13, %zmm1, %zmm10
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm31 {%k1}
+; AVX512F-NEXT:    vpermt2q %zmm11, %zmm2, %zmm4
+; AVX512F-NEXT:    vpermt2q %zmm16, %zmm2, %zmm28
+; AVX512F-NEXT:    vpermt2q %zmm10, %zmm2, %zmm7
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm31 {%k1}
 ; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,2,3,4,5,10,15]
-; AVX512F-NEXT:    vpermt2q %zmm16, %zmm3, %zmm31
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm26 {%k1}
-; AVX512F-NEXT:    vpermt2q %zmm15, %zmm3, %zmm26
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm21 {%k1}
-; AVX512F-NEXT:    vpermt2q %zmm11, %zmm3, %zmm21
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm20 {%k1}
+; AVX512F-NEXT:    vpermt2q %zmm14, %zmm3, %zmm31
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm23 {%k1}
+; AVX512F-NEXT:    vpermt2q %zmm18, %zmm3, %zmm23
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm21 {%k1}
+; AVX512F-NEXT:    vpermt2q %zmm13, %zmm3, %zmm21
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm20 {%k1}
 ; AVX512F-NEXT:    vpermt2q %zmm17, %zmm3, %zmm20
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm22 {%k1}
-; AVX512F-NEXT:    vpermt2q %zmm14, %zmm3, %zmm22
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm12 {%k1}
-; AVX512F-NEXT:    vpermt2q %zmm18, %zmm3, %zmm12
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm6 {%k1}
-; AVX512F-NEXT:    vpermt2q %zmm13, %zmm3, %zmm6
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm9 {%k1}
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm25 {%k1}
+; AVX512F-NEXT:    vpermt2q %zmm11, %zmm3, %zmm25
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm12 {%k1}
+; AVX512F-NEXT:    vpermt2q %zmm16, %zmm3, %zmm12
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512F-NEXT:    vpermt2q %zmm10, %zmm3, %zmm5
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm9 {%k1}
 ; AVX512F-NEXT:    vpermt2q %zmm19, %zmm3, %zmm9
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm1, 448(%rsi)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm1, 384(%rsi)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm1, 320(%rsi)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm1, 256(%rsi)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm1, 192(%rsi)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm1, 128(%rsi)
-; AVX512F-NEXT:    vmovups (%rsp), %zmm1 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm1, 64(%rsi)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm1, (%rsi)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm1, 448(%rdx)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm1, 256(%rdx)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm1, 320(%rdx)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm1, 128(%rdx)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm1, 192(%rdx)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm1, (%rdx)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm1, 64(%rdx)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm1, 384(%rdx)
-; AVX512F-NEXT:    vmovdqa64 %zmm8, 448(%rcx)
-; AVX512F-NEXT:    vmovdqa64 %zmm28, 256(%rcx)
-; AVX512F-NEXT:    vmovdqa64 %zmm24, 320(%rcx)
-; AVX512F-NEXT:    vmovdqa64 %zmm25, 128(%rcx)
-; AVX512F-NEXT:    vmovdqa64 %zmm30, 192(%rcx)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm1, (%rcx)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm1, 64(%rcx)
-; AVX512F-NEXT:    vmovdqa64 %zmm23, 384(%rcx)
-; AVX512F-NEXT:    vmovdqa64 %zmm10, 448(%r8)
-; AVX512F-NEXT:    vmovdqa64 %zmm29, 256(%r8)
-; AVX512F-NEXT:    vmovdqa64 %zmm5, 320(%r8)
-; AVX512F-NEXT:    vmovdqa64 %zmm4, 128(%r8)
-; AVX512F-NEXT:    vmovdqa64 %zmm0, 192(%r8)
-; AVX512F-NEXT:    vmovdqa64 %zmm7, (%r8)
-; AVX512F-NEXT:    vmovdqa64 %zmm2, 64(%r8)
-; AVX512F-NEXT:    vmovdqa64 %zmm27, 384(%r8)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm2, 448(%rsi)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm2, 384(%rsi)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm2, 320(%rsi)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm2, 256(%rsi)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm2, 192(%rsi)
+; AVX512F-NEXT:    vmovups (%rsp), %zmm2 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm2, 128(%rsi)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm2, 64(%rsi)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm2, (%rsi)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm2, 448(%rdx)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm2, 256(%rdx)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm2, 320(%rdx)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm2, 128(%rdx)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm2, 192(%rdx)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm2, (%rdx)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm2, 64(%rdx)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm2, 384(%rdx)
+; AVX512F-NEXT:    vmovdqa64 %zmm6, 448(%rcx)
+; AVX512F-NEXT:    vmovdqa64 %zmm27, 256(%rcx)
+; AVX512F-NEXT:    vmovdqa64 %zmm30, 320(%rcx)
+; AVX512F-NEXT:    vmovdqa64 %zmm15, 128(%rcx)
+; AVX512F-NEXT:    vmovdqa64 %zmm26, 192(%rcx)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm2, (%rcx)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm2, 64(%rcx)
+; AVX512F-NEXT:    vmovdqa64 %zmm22, 384(%rcx)
+; AVX512F-NEXT:    vmovdqa64 %zmm7, 448(%r8)
+; AVX512F-NEXT:    vmovdqa64 %zmm28, 256(%r8)
+; AVX512F-NEXT:    vmovdqa64 %zmm4, 320(%r8)
+; AVX512F-NEXT:    vmovdqa64 %zmm0, 128(%r8)
+; AVX512F-NEXT:    vmovdqa64 %zmm1, 192(%r8)
+; AVX512F-NEXT:    vmovdqa64 %zmm8, (%r8)
+; AVX512F-NEXT:    vmovdqa64 %zmm29, 64(%r8)
+; AVX512F-NEXT:    vmovdqa64 %zmm24, 384(%r8)
 ; AVX512F-NEXT:    vmovdqa64 %zmm9, 384(%r9)
-; AVX512F-NEXT:    vmovdqa64 %zmm6, 448(%r9)
+; AVX512F-NEXT:    vmovdqa64 %zmm5, 448(%r9)
 ; AVX512F-NEXT:    vmovdqa64 %zmm12, 256(%r9)
-; AVX512F-NEXT:    vmovdqa64 %zmm22, 320(%r9)
+; AVX512F-NEXT:    vmovdqa64 %zmm25, 320(%r9)
 ; AVX512F-NEXT:    vmovdqa64 %zmm20, 128(%r9)
 ; AVX512F-NEXT:    vmovdqa64 %zmm21, 192(%r9)
-; AVX512F-NEXT:    vmovdqa64 %zmm31, (%r9)
-; AVX512F-NEXT:    vmovdqa64 %zmm26, 64(%r9)
-; AVX512F-NEXT:    addq $3400, %rsp # imm = 0xD48
+; AVX512F-NEXT:    vmovdqa64 %zmm23, (%r9)
+; AVX512F-NEXT:    vmovdqa64 %zmm31, 64(%r9)
+; AVX512F-NEXT:    addq $3336, %rsp # imm = 0xD08
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512BW-LABEL: load_i64_stride5_vf64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    subq $3400, %rsp # imm = 0xD48
-; AVX512BW-NEXT:    vmovdqa64 1728(%rdi), %zmm21
-; AVX512BW-NEXT:    vmovdqa64 1792(%rdi), %zmm4
-; AVX512BW-NEXT:    vmovdqa64 1408(%rdi), %zmm19
-; AVX512BW-NEXT:    vmovdqa64 1088(%rdi), %zmm0
-; AVX512BW-NEXT:    vmovdqa64 1152(%rdi), %zmm3
-; AVX512BW-NEXT:    vmovdqa64 768(%rdi), %zmm26
-; AVX512BW-NEXT:    vmovdqa64 832(%rdi), %zmm5
-; AVX512BW-NEXT:    vmovdqa64 448(%rdi), %zmm1
-; AVX512BW-NEXT:    vmovdqa64 512(%rdi), %zmm6
+; AVX512BW-NEXT:    subq $3336, %rsp # imm = 0xD08
+; AVX512BW-NEXT:    vmovdqa64 1728(%rdi), %zmm3
+; AVX512BW-NEXT:    vmovdqa64 1792(%rdi), %zmm7
+; AVX512BW-NEXT:    vmovdqa64 1408(%rdi), %zmm0
+; AVX512BW-NEXT:    vmovdqa64 1088(%rdi), %zmm1
+; AVX512BW-NEXT:    vmovdqa64 1152(%rdi), %zmm6
+; AVX512BW-NEXT:    vmovdqa64 768(%rdi), %zmm5
+; AVX512BW-NEXT:    vmovdqa64 832(%rdi), %zmm8
+; AVX512BW-NEXT:    vmovdqa64 448(%rdi), %zmm4
+; AVX512BW-NEXT:    vmovdqa64 512(%rdi), %zmm9
 ; AVX512BW-NEXT:    vmovdqa64 128(%rdi), %zmm2
-; AVX512BW-NEXT:    vmovdqa64 192(%rdi), %zmm7
-; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm15 = [12,1,6,0,12,1,6,0]
-; AVX512BW-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm16
-; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm15, %zmm16
-; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm8
-; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm15, %zmm8
-; AVX512BW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm8
-; AVX512BW-NEXT:    vpermt2q %zmm26, %zmm15, %zmm8
-; AVX512BW-NEXT:    vmovdqu64 %zmm8, (%rsp) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm9
-; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm8
-; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm15, %zmm9
+; AVX512BW-NEXT:    vmovdqa64 192(%rdi), %zmm10
+; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm16 = [12,1,6,0,12,1,6,0]
+; AVX512BW-NEXT:    # zmm16 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm11
+; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm16, %zmm11
+; AVX512BW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm11
+; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm16, %zmm11
+; AVX512BW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm11
+; AVX512BW-NEXT:    vpermt2q %zmm5, %zmm16, %zmm11
+; AVX512BW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm12
+; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm11
+; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm16, %zmm12
+; AVX512BW-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm6
+; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm16, %zmm6
+; AVX512BW-NEXT:    vmovdqu64 %zmm6, (%rsp) # 64-byte Spill
+; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm12 = [5,10,15,0,5,10,15,0]
+; AVX512BW-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm6
+; AVX512BW-NEXT:    vpermt2q %zmm10, %zmm12, %zmm6
+; AVX512BW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm6
+; AVX512BW-NEXT:    vpermt2q %zmm9, %zmm12, %zmm6
+; AVX512BW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm6
+; AVX512BW-NEXT:    vpermt2q %zmm11, %zmm12, %zmm6
+; AVX512BW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm6
+; AVX512BW-NEXT:    vpermt2q %zmm8, %zmm12, %zmm6
+; AVX512BW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm6
+; AVX512BW-NEXT:    vpermt2q %zmm7, %zmm12, %zmm6
+; AVX512BW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm6 = [6,11,0,1,6,11,0,1]
+; AVX512BW-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm13
+; AVX512BW-NEXT:    vpermt2q %zmm10, %zmm6, %zmm13
+; AVX512BW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm13
+; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm6
+; AVX512BW-NEXT:    vpermt2q %zmm9, %zmm13, %zmm6
+; AVX512BW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm6
+; AVX512BW-NEXT:    vpermt2q %zmm11, %zmm13, %zmm6
+; AVX512BW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm6
+; AVX512BW-NEXT:    vpermt2q %zmm8, %zmm13, %zmm6
+; AVX512BW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm6
+; AVX512BW-NEXT:    vpermt2q %zmm7, %zmm13, %zmm6
+; AVX512BW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm14 = [7,12,0,2,7,12,0,2]
+; AVX512BW-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm6
+; AVX512BW-NEXT:    vpermt2q %zmm9, %zmm14, %zmm6
+; AVX512BW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm6 = [0,5,0,11,0,5,0,11]
+; AVX512BW-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm6, %zmm9
 ; AVX512BW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm3
-; AVX512BW-NEXT:    vpermt2q %zmm21, %zmm15, %zmm3
-; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm9 = [5,10,15,0,5,10,15,0]
-; AVX512BW-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm3
-; AVX512BW-NEXT:    vpermt2q %zmm7, %zmm9, %zmm3
-; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm3
-; AVX512BW-NEXT:    vpermt2q %zmm6, %zmm9, %zmm3
-; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm3
-; AVX512BW-NEXT:    vpermt2q %zmm8, %zmm9, %zmm3
-; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm26, %zmm3
-; AVX512BW-NEXT:    vpermt2q %zmm5, %zmm9, %zmm3
-; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm21, %zmm3
-; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm9, %zmm3
-; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [6,11,0,1,6,11,0,1]
-; AVX512BW-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm3
-; AVX512BW-NEXT:    vpermt2q %zmm7, %zmm10, %zmm3
-; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm3
-; AVX512BW-NEXT:    vpermt2q %zmm6, %zmm10, %zmm3
-; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm3
-; AVX512BW-NEXT:    vpermt2q %zmm8, %zmm10, %zmm3
-; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm26, %zmm3
-; AVX512BW-NEXT:    vpermt2q %zmm5, %zmm10, %zmm3
-; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm21, %zmm3
-; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm10, %zmm3
-; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm11 = [7,12,0,2,7,12,0,2]
-; AVX512BW-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm3
-; AVX512BW-NEXT:    vpermt2q %zmm7, %zmm11, %zmm3
-; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,0,11,0,5,0,11]
-; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm3, %zmm7
-; AVX512BW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm4
+; AVX512BW-NEXT:    vpermt2q %zmm10, %zmm14, %zmm4
+; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm6, %zmm10
+; AVX512BW-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm2
-; AVX512BW-NEXT:    vpermt2q %zmm6, %zmm11, %zmm2
+; AVX512BW-NEXT:    vpermt2q %zmm11, %zmm14, %zmm2
 ; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm3, %zmm6
-; AVX512BW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm1
-; AVX512BW-NEXT:    vpermt2q %zmm8, %zmm11, %zmm1
+; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm6, %zmm11
+; AVX512BW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm1
+; AVX512BW-NEXT:    vpermt2q %zmm8, %zmm14, %zmm1
 ; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm3, %zmm8
+; AVX512BW-NEXT:    vpermt2q %zmm5, %zmm6, %zmm8
 ; AVX512BW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm26, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm5, %zmm11, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm26, %zmm3, %zmm5
-; AVX512BW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm21, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm11, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm21, %zmm3, %zmm4
-; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 1472(%rdi), %zmm1
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm17
-; AVX512BW-NEXT:    vpermt2q %zmm19, %zmm15, %zmm17
-; AVX512BW-NEXT:    vmovdqa64 %zmm19, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm9, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm19, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm10, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm19, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm11, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm19, %zmm3, %zmm1
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm1
+; AVX512BW-NEXT:    vpermt2q %zmm7, %zmm14, %zmm1
 ; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 2048(%rdi), %zmm0
-; AVX512BW-NEXT:    vmovdqa64 2112(%rdi), %zmm4
-; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm18
-; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm15, %zmm18
-; AVX512BW-NEXT:    vmovdqa64 2368(%rdi), %zmm1
-; AVX512BW-NEXT:    vmovdqa64 2432(%rdi), %zmm5
-; AVX512BW-NEXT:    vpermi2q %zmm1, %zmm5, %zmm15
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm2
-; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm9, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermi2q %zmm5, %zmm1, %zmm9
-; AVX512BW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm2
-; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm10, %zmm2
+; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm6, %zmm7
+; AVX512BW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 1472(%rdi), %zmm2
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm19
+; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm16, %zmm19
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm1
+; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm12, %zmm1
+; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm1
+; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm13, %zmm1
+; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm1
+; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm14, %zmm1
+; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm6, %zmm2
 ; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermi2q %zmm5, %zmm1, %zmm10
-; AVX512BW-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm2
-; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm11, %zmm2
+; AVX512BW-NEXT:    vmovdqa64 2048(%rdi), %zmm0
+; AVX512BW-NEXT:    vmovdqa64 2112(%rdi), %zmm2
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm18
+; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm16, %zmm18
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm1
+; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm12, %zmm1
+; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm1
+; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm13, %zmm1
+; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm1
+; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm14, %zmm1
+; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm6, %zmm2
 ; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermi2q %zmm5, %zmm1, %zmm11
-; AVX512BW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm3, %zmm5
-; AVX512BW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm3, %zmm4
-; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm2
-; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %zmm31
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,5,10,15]
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm19
-; AVX512BW-NEXT:    vpermt2q %zmm31, %zmm1, %zmm19
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm4 = <1,6,11,u>
+; AVX512BW-NEXT:    vmovdqa64 2368(%rdi), %zmm0
+; AVX512BW-NEXT:    vmovdqa64 2432(%rdi), %zmm1
+; AVX512BW-NEXT:    vpermi2q %zmm1, %zmm0, %zmm12
+; AVX512BW-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermi2q %zmm1, %zmm0, %zmm13
+; AVX512BW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermi2q %zmm1, %zmm0, %zmm14
+; AVX512BW-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm17
+; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm6, %zmm1
+; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm16, %zmm17
+; AVX512BW-NEXT:    vmovdqa64 384(%rdi), %zmm31
+; AVX512BW-NEXT:    vmovdqa64 320(%rdi), %zmm2
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,5,10,15]
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm1
+; AVX512BW-NEXT:    vpermt2q %zmm31, %zmm3, %zmm1
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} ymm29 = <1,6,11,u>
 ; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm31, %zmm4, %zmm0
+; AVX512BW-NEXT:    vpermt2q %zmm31, %zmm29, %zmm0
 ; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm8 = <2,7,12,u>
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm6 = <2,7,12,u>
 ; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm31, %zmm8, %zmm0
+; AVX512BW-NEXT:    vpermt2q %zmm31, %zmm6, %zmm0
 ; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm10 = <11,0,5,u>
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm7 = <11,0,5,u>
 ; AVX512BW-NEXT:    vmovdqa64 %zmm31, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm10, %zmm0
+; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm7, %zmm0
 ; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} ymm25 = <12,1,6,u>
-; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm25, %zmm31
-; AVX512BW-NEXT:    vmovdqa64 384(%rdi), %zmm26
-; AVX512BW-NEXT:    vmovdqa64 320(%rdi), %zmm2
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm30
-; AVX512BW-NEXT:    vpermt2q %zmm26, %zmm1, %zmm30
+; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm16, %zmm31
+; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm2
+; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %zmm23
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm4
+; AVX512BW-NEXT:    vpermt2q %zmm23, %zmm3, %zmm4
 ; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm26, %zmm4, %zmm0
+; AVX512BW-NEXT:    vpermt2q %zmm23, %zmm29, %zmm0
 ; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm26, %zmm8, %zmm0
+; AVX512BW-NEXT:    vpermt2q %zmm23, %zmm6, %zmm0
 ; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm26, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm10, %zmm0
+; AVX512BW-NEXT:    vmovdqa64 %zmm23, %zmm0
+; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm7, %zmm0
 ; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm25, %zmm26
+; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm16, %zmm23
 ; AVX512BW-NEXT:    vmovdqa64 1024(%rdi), %zmm21
 ; AVX512BW-NEXT:    vmovdqa64 960(%rdi), %zmm2
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm3
-; AVX512BW-NEXT:    vpermt2q %zmm21, %zmm1, %zmm3
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm10
+; AVX512BW-NEXT:    vpermt2q %zmm21, %zmm3, %zmm10
 ; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm21, %zmm4, %zmm0
+; AVX512BW-NEXT:    vpermt2q %zmm21, %zmm29, %zmm0
 ; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm21, %zmm8, %zmm0
+; AVX512BW-NEXT:    vpermt2q %zmm21, %zmm6, %zmm0
 ; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqa64 %zmm21, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm10, %zmm0
+; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm7, %zmm0
 ; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm25, %zmm21
+; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm16, %zmm21
 ; AVX512BW-NEXT:    vmovdqa64 704(%rdi), %zmm20
 ; AVX512BW-NEXT:    vmovdqa64 640(%rdi), %zmm2
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm5
-; AVX512BW-NEXT:    vpermt2q %zmm20, %zmm1, %zmm5
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm11
+; AVX512BW-NEXT:    vpermt2q %zmm20, %zmm3, %zmm11
 ; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm20, %zmm4, %zmm0
+; AVX512BW-NEXT:    vpermt2q %zmm20, %zmm29, %zmm0
 ; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm20, %zmm8, %zmm0
+; AVX512BW-NEXT:    vpermt2q %zmm20, %zmm6, %zmm0
 ; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqa64 %zmm20, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm10, %zmm0
+; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm7, %zmm0
 ; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm25, %zmm20
-; AVX512BW-NEXT:    vmovdqa64 1664(%rdi), %zmm22
+; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm16, %zmm20
+; AVX512BW-NEXT:    vmovdqa64 1664(%rdi), %zmm25
 ; AVX512BW-NEXT:    vmovdqa64 1600(%rdi), %zmm2
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm11
-; AVX512BW-NEXT:    vpermt2q %zmm22, %zmm1, %zmm11
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm22, %zmm4, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm13
+; AVX512BW-NEXT:    vpermt2q %zmm25, %zmm3, %zmm13
 ; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm22, %zmm8, %zmm0
+; AVX512BW-NEXT:    vpermt2q %zmm25, %zmm29, %zmm0
 ; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm22, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm10, %zmm0
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm30
+; AVX512BW-NEXT:    vpermt2q %zmm25, %zmm6, %zmm30
+; AVX512BW-NEXT:    vmovdqa64 %zmm25, %zmm0
+; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm7, %zmm0
 ; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm25, %zmm22
+; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm16, %zmm25
 ; AVX512BW-NEXT:    vmovdqa64 1344(%rdi), %zmm12
 ; AVX512BW-NEXT:    vmovdqa64 1280(%rdi), %zmm2
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm13
-; AVX512BW-NEXT:    vpermt2q %zmm12, %zmm1, %zmm13
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm24
-; AVX512BW-NEXT:    vpermt2q %zmm12, %zmm4, %zmm24
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm28
-; AVX512BW-NEXT:    vpermt2q %zmm12, %zmm8, %zmm28
-; AVX512BW-NEXT:    vmovdqa64 %zmm12, %zmm29
-; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm10, %zmm29
-; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm25, %zmm12
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm14
+; AVX512BW-NEXT:    vpermt2q %zmm12, %zmm3, %zmm14
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm26
+; AVX512BW-NEXT:    vpermt2q %zmm12, %zmm29, %zmm26
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm27
+; AVX512BW-NEXT:    vpermt2q %zmm12, %zmm6, %zmm27
+; AVX512BW-NEXT:    vmovdqa64 %zmm12, %zmm28
+; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm7, %zmm28
+; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm16, %zmm12
 ; AVX512BW-NEXT:    vmovdqa64 1984(%rdi), %zmm9
-; AVX512BW-NEXT:    vmovdqa64 1920(%rdi), %zmm14
-; AVX512BW-NEXT:    vmovdqa64 %zmm14, %zmm2
-; AVX512BW-NEXT:    vpermt2q %zmm9, %zmm1, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 2304(%rdi), %zmm6
+; AVX512BW-NEXT:    vmovdqa64 1920(%rdi), %zmm15
+; AVX512BW-NEXT:    vmovdqa64 %zmm15, %zmm2
+; AVX512BW-NEXT:    vpermt2q %zmm9, %zmm3, %zmm2
+; AVX512BW-NEXT:    vmovdqa64 2304(%rdi), %zmm5
 ; AVX512BW-NEXT:    vmovdqa64 2240(%rdi), %zmm0
-; AVX512BW-NEXT:    vpermi2q %zmm6, %zmm0, %zmm1
-; AVX512BW-NEXT:    vmovdqa64 %zmm14, %zmm7
-; AVX512BW-NEXT:    vpermt2q %zmm9, %zmm4, %zmm7
-; AVX512BW-NEXT:    vpermi2q %zmm6, %zmm0, %zmm4
-; AVX512BW-NEXT:    vmovdqa64 %zmm14, %zmm23
-; AVX512BW-NEXT:    vpermt2q %zmm9, %zmm8, %zmm23
-; AVX512BW-NEXT:    vpermi2q %zmm6, %zmm0, %zmm8
-; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm27
-; AVX512BW-NEXT:    vpermt2q %zmm14, %zmm10, %zmm27
-; AVX512BW-NEXT:    vpermi2q %zmm0, %zmm6, %zmm10
-; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm25, %zmm6
-; AVX512BW-NEXT:    vpermt2q %zmm14, %zmm25, %zmm9
-; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm19[0,1,2,3],zmm16[4,5,6,7]
-; AVX512BW-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm30, %zmm14 # 64-byte Folded Reload
-; AVX512BW-NEXT:    # zmm14 = zmm30[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-NEXT:    vshufi64x2 $228, (%rsp), %zmm5, %zmm5 # 64-byte Folded Reload
-; AVX512BW-NEXT:    # zmm5 = zmm5[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 64-byte Folded Reload
-; AVX512BW-NEXT:    # zmm3 = zmm3[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm13 = zmm13[0,1,2,3],zmm17[4,5,6,7]
-; AVX512BW-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm19 # 64-byte Folded Reload
-; AVX512BW-NEXT:    # zmm19 = zmm11[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-NEXT:    vpermi2q %zmm5, %zmm0, %zmm3
+; AVX512BW-NEXT:    vmovdqa64 %zmm15, %zmm8
+; AVX512BW-NEXT:    vpermt2q %zmm9, %zmm29, %zmm8
+; AVX512BW-NEXT:    vpermi2q %zmm5, %zmm0, %zmm29
+; AVX512BW-NEXT:    vmovdqa64 %zmm15, %zmm22
+; AVX512BW-NEXT:    vpermt2q %zmm9, %zmm6, %zmm22
+; AVX512BW-NEXT:    vpermi2q %zmm5, %zmm0, %zmm6
+; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm24
+; AVX512BW-NEXT:    vpermt2q %zmm15, %zmm7, %zmm24
+; AVX512BW-NEXT:    vpermi2q %zmm0, %zmm5, %zmm7
+; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm16, %zmm5
+; AVX512BW-NEXT:    vpermt2q %zmm15, %zmm16, %zmm9
+; AVX512BW-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm0 # 64-byte Folded Reload
+; AVX512BW-NEXT:    # zmm0 = zmm4[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm4 # 64-byte Folded Reload
+; AVX512BW-NEXT:    # zmm4 = zmm1[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm11 # 64-byte Folded Reload
+; AVX512BW-NEXT:    # zmm11 = zmm11[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm10 # 64-byte Folded Reload
+; AVX512BW-NEXT:    # zmm10 = zmm10[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm19 = zmm14[0,1,2,3],zmm19[4,5,6,7]
+; AVX512BW-NEXT:    vshufi64x2 $228, (%rsp), %zmm13, %zmm1 # 64-byte Folded Reload
+; AVX512BW-NEXT:    # zmm1 = zmm13[0,1,2,3],mem[4,5,6,7]
 ; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm18[4,5,6,7]
-; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm15[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqa64 256(%rdi), %zmm16
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm25 = [0,1,2,3,4,5,6,11]
-; AVX512BW-NEXT:    vpermt2q %zmm16, %zmm25, %zmm0
+; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm17[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqa64 256(%rdi), %zmm18
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm15 = [0,1,2,3,4,5,6,11]
+; AVX512BW-NEXT:    vpermt2q %zmm18, %zmm15, %zmm0
 ; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 576(%rdi), %zmm15
-; AVX512BW-NEXT:    vpermt2q %zmm15, %zmm25, %zmm14
-; AVX512BW-NEXT:    vmovdqu64 %zmm14, (%rsp) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 576(%rdi), %zmm14
+; AVX512BW-NEXT:    vpermt2q %zmm14, %zmm15, %zmm4
+; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqa64 896(%rdi), %zmm17
-; AVX512BW-NEXT:    vpermt2q %zmm17, %zmm25, %zmm5
-; AVX512BW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 1216(%rdi), %zmm11
-; AVX512BW-NEXT:    vpermt2q %zmm11, %zmm25, %zmm3
-; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 1536(%rdi), %zmm18
-; AVX512BW-NEXT:    vpermt2q %zmm18, %zmm25, %zmm13
-; AVX512BW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 1856(%rdi), %zmm14
-; AVX512BW-NEXT:    vpermt2q %zmm14, %zmm25, %zmm19
+; AVX512BW-NEXT:    vpermt2q %zmm17, %zmm15, %zmm11
+; AVX512BW-NEXT:    vmovdqu64 %zmm11, (%rsp) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 1216(%rdi), %zmm13
+; AVX512BW-NEXT:    vpermt2q %zmm13, %zmm15, %zmm10
+; AVX512BW-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 1536(%rdi), %zmm16
+; AVX512BW-NEXT:    vpermt2q %zmm16, %zmm15, %zmm19
 ; AVX512BW-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 1856(%rdi), %zmm11
+; AVX512BW-NEXT:    vpermt2q %zmm11, %zmm15, %zmm1
+; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqa64 2176(%rdi), %zmm19
-; AVX512BW-NEXT:    vpermt2q %zmm19, %zmm25, %zmm2
+; AVX512BW-NEXT:    vpermt2q %zmm19, %zmm15, %zmm2
 ; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 2496(%rdi), %zmm13
-; AVX512BW-NEXT:    vpermt2q %zmm13, %zmm25, %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 2496(%rdi), %zmm10
+; AVX512BW-NEXT:    vpermt2q %zmm10, %zmm15, %zmm3
+; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    movb $7, %al
 ; AVX512BW-NEXT:    kmovd %eax, %k1
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm5 {%k1}
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm3 {%k1}
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm25 {%k1}
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm15 {%k1}
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm4 {%k1}
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm7 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm8 {%k1}
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
 ; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1}
@@ -6247,176 +6250,177 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
 ; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm1 {%k1}
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm24, %zmm2 {%k1}
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm24 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,12]
-; AVX512BW-NEXT:    vpermt2q %zmm16, %zmm4, %zmm5
-; AVX512BW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm19, %zmm4, %zmm3
-; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm15, %zmm4, %zmm25
-; AVX512BW-NEXT:    vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm11, %zmm4, %zmm7
-; AVX512BW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm17, %zmm4, %zmm0
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm2 {%k1}
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm26, %zmm3 {%k1}
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm29, %zmm26 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm29 = [0,1,2,3,4,5,6,12]
+; AVX512BW-NEXT:    vpermt2q %zmm18, %zmm29, %zmm15
+; AVX512BW-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm19, %zmm29, %zmm4
+; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm14, %zmm29, %zmm8
+; AVX512BW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm13, %zmm29, %zmm0
 ; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm14, %zmm4, %zmm1
+; AVX512BW-NEXT:    vpermt2q %zmm17, %zmm29, %zmm1
 ; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm18, %zmm4, %zmm2
+; AVX512BW-NEXT:    vpermt2q %zmm11, %zmm29, %zmm2
 ; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm13, %zmm4, %zmm24
-; AVX512BW-NEXT:    vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm16, %zmm29, %zmm3
+; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm10, %zmm29, %zmm26
+; AVX512BW-NEXT:    vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    movb $56, %al
 ; AVX512BW-NEXT:    kmovd %eax, %k1
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm23 {%k1}
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1}
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm30 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm0 {%k1}
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm25 {%k1}
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm24 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm22 {%k1}
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm8 {%k1}
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm26 {%k1}
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm15 {%k1}
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm30 {%k1}
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm28 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm27 {%k1}
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm8 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,2,3,4,5,8,13]
-; AVX512BW-NEXT:    vpermt2q %zmm16, %zmm3, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm19, %zmm3, %zmm23
-; AVX512BW-NEXT:    vpermt2q %zmm15, %zmm3, %zmm0
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm6 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,4,5,8,13]
+; AVX512BW-NEXT:    vpermt2q %zmm18, %zmm1, %zmm0
 ; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm11, %zmm3, %zmm30
-; AVX512BW-NEXT:    vpermt2q %zmm17, %zmm3, %zmm25
-; AVX512BW-NEXT:    vpermt2q %zmm14, %zmm3, %zmm24
-; AVX512BW-NEXT:    vpermt2q %zmm18, %zmm3, %zmm28
-; AVX512BW-NEXT:    vpermt2q %zmm13, %zmm3, %zmm8
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm7 {%k1}
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm27 {%k1}
+; AVX512BW-NEXT:    vpermt2q %zmm19, %zmm1, %zmm22
+; AVX512BW-NEXT:    vpermt2q %zmm14, %zmm1, %zmm8
+; AVX512BW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm13, %zmm1, %zmm26
+; AVX512BW-NEXT:    vpermt2q %zmm17, %zmm1, %zmm15
+; AVX512BW-NEXT:    vpermt2q %zmm11, %zmm1, %zmm30
+; AVX512BW-NEXT:    vpermt2q %zmm16, %zmm1, %zmm27
+; AVX512BW-NEXT:    vpermt2q %zmm10, %zmm1, %zmm6
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm8 {%k1}
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm24 {%k1}
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm29 {%k1}
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm1 {%k1}
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm3 {%k1}
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm5 {%k1}
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm29 {%k1}
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm10 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,4,5,9,14]
-; AVX512BW-NEXT:    vpermt2q %zmm16, %zmm1, %zmm7
-; AVX512BW-NEXT:    vpermt2q %zmm19, %zmm1, %zmm27
-; AVX512BW-NEXT:    vpermt2q %zmm15, %zmm1, %zmm0
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm2
-; AVX512BW-NEXT:    vpermt2q %zmm11, %zmm1, %zmm3
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm4 {%k1}
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm28 {%k1}
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm7 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,5,9,14]
+; AVX512BW-NEXT:    vpermt2q %zmm18, %zmm2, %zmm8
+; AVX512BW-NEXT:    vpermt2q %zmm19, %zmm2, %zmm24
+; AVX512BW-NEXT:    vpermt2q %zmm14, %zmm2, %zmm29
+; AVX512BW-NEXT:    vpermt2q %zmm13, %zmm2, %zmm1
+; AVX512BW-NEXT:    vpermt2q %zmm17, %zmm2, %zmm3
 ; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm17, %zmm1, %zmm4
-; AVX512BW-NEXT:    vpermt2q %zmm14, %zmm1, %zmm5
-; AVX512BW-NEXT:    vpermt2q %zmm18, %zmm1, %zmm29
-; AVX512BW-NEXT:    vpermt2q %zmm13, %zmm1, %zmm10
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm31 {%k1}
+; AVX512BW-NEXT:    vpermt2q %zmm11, %zmm2, %zmm4
+; AVX512BW-NEXT:    vpermt2q %zmm16, %zmm2, %zmm28
+; AVX512BW-NEXT:    vpermt2q %zmm10, %zmm2, %zmm7
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm31 {%k1}
 ; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,2,3,4,5,10,15]
-; AVX512BW-NEXT:    vpermt2q %zmm16, %zmm3, %zmm31
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm26 {%k1}
-; AVX512BW-NEXT:    vpermt2q %zmm15, %zmm3, %zmm26
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm21 {%k1}
-; AVX512BW-NEXT:    vpermt2q %zmm11, %zmm3, %zmm21
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm20 {%k1}
+; AVX512BW-NEXT:    vpermt2q %zmm14, %zmm3, %zmm31
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm23 {%k1}
+; AVX512BW-NEXT:    vpermt2q %zmm18, %zmm3, %zmm23
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm21 {%k1}
+; AVX512BW-NEXT:    vpermt2q %zmm13, %zmm3, %zmm21
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm20 {%k1}
 ; AVX512BW-NEXT:    vpermt2q %zmm17, %zmm3, %zmm20
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm22 {%k1}
-; AVX512BW-NEXT:    vpermt2q %zmm14, %zmm3, %zmm22
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm12 {%k1}
-; AVX512BW-NEXT:    vpermt2q %zmm18, %zmm3, %zmm12
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm6 {%k1}
-; AVX512BW-NEXT:    vpermt2q %zmm13, %zmm3, %zmm6
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm9 {%k1}
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm25 {%k1}
+; AVX512BW-NEXT:    vpermt2q %zmm11, %zmm3, %zmm25
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm12 {%k1}
+; AVX512BW-NEXT:    vpermt2q %zmm16, %zmm3, %zmm12
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512BW-NEXT:    vpermt2q %zmm10, %zmm3, %zmm5
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm9 {%k1}
 ; AVX512BW-NEXT:    vpermt2q %zmm19, %zmm3, %zmm9
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm1, 448(%rsi)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm1, 384(%rsi)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm1, 320(%rsi)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm1, 256(%rsi)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm1, 192(%rsi)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm1, 128(%rsi)
-; AVX512BW-NEXT:    vmovups (%rsp), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm1, 64(%rsi)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm1, (%rsi)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm1, 448(%rdx)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm1, 256(%rdx)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm1, 320(%rdx)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm1, 128(%rdx)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm1, 192(%rdx)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm1, (%rdx)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm1, 64(%rdx)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm1, 384(%rdx)
-; AVX512BW-NEXT:    vmovdqa64 %zmm8, 448(%rcx)
-; AVX512BW-NEXT:    vmovdqa64 %zmm28, 256(%rcx)
-; AVX512BW-NEXT:    vmovdqa64 %zmm24, 320(%rcx)
-; AVX512BW-NEXT:    vmovdqa64 %zmm25, 128(%rcx)
-; AVX512BW-NEXT:    vmovdqa64 %zmm30, 192(%rcx)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm1, (%rcx)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm1, 64(%rcx)
-; AVX512BW-NEXT:    vmovdqa64 %zmm23, 384(%rcx)
-; AVX512BW-NEXT:    vmovdqa64 %zmm10, 448(%r8)
-; AVX512BW-NEXT:    vmovdqa64 %zmm29, 256(%r8)
-; AVX512BW-NEXT:    vmovdqa64 %zmm5, 320(%r8)
-; AVX512BW-NEXT:    vmovdqa64 %zmm4, 128(%r8)
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, 192(%r8)
-; AVX512BW-NEXT:    vmovdqa64 %zmm7, (%r8)
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, 64(%r8)
-; AVX512BW-NEXT:    vmovdqa64 %zmm27, 384(%r8)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm2, 448(%rsi)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm2, 384(%rsi)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm2, 320(%rsi)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm2, 256(%rsi)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm2, 192(%rsi)
+; AVX512BW-NEXT:    vmovups (%rsp), %zmm2 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm2, 128(%rsi)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm2, 64(%rsi)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm2, (%rsi)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm2, 448(%rdx)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm2, 256(%rdx)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm2, 320(%rdx)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm2, 128(%rdx)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm2, 192(%rdx)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm2, (%rdx)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm2, 64(%rdx)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm2, 384(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm6, 448(%rcx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm27, 256(%rcx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm30, 320(%rcx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm15, 128(%rcx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm26, 192(%rcx)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm2, (%rcx)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm2, 64(%rcx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm22, 384(%rcx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm7, 448(%r8)
+; AVX512BW-NEXT:    vmovdqa64 %zmm28, 256(%r8)
+; AVX512BW-NEXT:    vmovdqa64 %zmm4, 320(%r8)
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, 128(%r8)
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, 192(%r8)
+; AVX512BW-NEXT:    vmovdqa64 %zmm8, (%r8)
+; AVX512BW-NEXT:    vmovdqa64 %zmm29, 64(%r8)
+; AVX512BW-NEXT:    vmovdqa64 %zmm24, 384(%r8)
 ; AVX512BW-NEXT:    vmovdqa64 %zmm9, 384(%r9)
-; AVX512BW-NEXT:    vmovdqa64 %zmm6, 448(%r9)
+; AVX512BW-NEXT:    vmovdqa64 %zmm5, 448(%r9)
 ; AVX512BW-NEXT:    vmovdqa64 %zmm12, 256(%r9)
-; AVX512BW-NEXT:    vmovdqa64 %zmm22, 320(%r9)
+; AVX512BW-NEXT:    vmovdqa64 %zmm25, 320(%r9)
 ; AVX512BW-NEXT:    vmovdqa64 %zmm20, 128(%r9)
 ; AVX512BW-NEXT:    vmovdqa64 %zmm21, 192(%r9)
-; AVX512BW-NEXT:    vmovdqa64 %zmm31, (%r9)
-; AVX512BW-NEXT:    vmovdqa64 %zmm26, 64(%r9)
-; AVX512BW-NEXT:    addq $3400, %rsp # imm = 0xD48
+; AVX512BW-NEXT:    vmovdqa64 %zmm23, (%r9)
+; AVX512BW-NEXT:    vmovdqa64 %zmm31, 64(%r9)
+; AVX512BW-NEXT:    addq $3336, %rsp # imm = 0xD08
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
   %wide.vec = load <320 x i64>, ptr %in.vec, align 64

diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-7.ll
index 7d9c056716ceeb..91a70fb000dd65 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-7.ll
@@ -751,223 +751,221 @@ define void @load_i64_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ;
 ; AVX512F-LABEL: load_i64_stride7_vf8:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa64 384(%rdi), %zmm1
-; AVX512F-NEXT:    vmovdqa64 320(%rdi), %zmm4
-; AVX512F-NEXT:    vmovdqa64 256(%rdi), %zmm5
-; AVX512F-NEXT:    vmovdqa64 192(%rdi), %zmm6
-; AVX512F-NEXT:    vmovdqa64 128(%rdi), %zmm7
-; AVX512F-NEXT:    vmovdqa64 64(%rdi), %zmm2
-; AVX512F-NEXT:    vmovdqa64 (%rdi), %zmm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm0 = <0,7,14,u>
-; AVX512F-NEXT:    vpermi2q %zmm2, %zmm3, %zmm0
-; AVX512F-NEXT:    movb $24, %al
-; AVX512F-NEXT:    kmovw %eax, %k2
-; AVX512F-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k2} = zmm7[4,5,4,5],zmm6[4,5,4,5]
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm8 = [10,3,10,3,10,3,10,3]
-; AVX512F-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vpermi2q %zmm4, %zmm5, %zmm8
-; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm9 = [0,5,6,9,0,5,6,9]
-; AVX512F-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vpermi2q %zmm1, %zmm8, %zmm9
-; AVX512F-NEXT:    movb $-32, %al
-; AVX512F-NEXT:    kmovw %eax, %k1
-; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm0 {%k1}
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm8 = [11,4,11,4,11,4,11,4]
-; AVX512F-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vpermi2q %zmm4, %zmm5, %zmm8
-; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [0,5,6,10,0,5,6,10]
-; AVX512F-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vpermi2q %zmm1, %zmm8, %zmm10
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm9 = [13,6,13,6,13,6,13,6]
-; AVX512F-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [14,0,0,7,14,0,0,7]
+; AVX512F-NEXT:    vmovdqa64 384(%rdi), %zmm0
+; AVX512F-NEXT:    vmovdqa64 320(%rdi), %zmm1
+; AVX512F-NEXT:    vmovdqa64 256(%rdi), %zmm2
+; AVX512F-NEXT:    vmovdqa64 192(%rdi), %zmm4
+; AVX512F-NEXT:    vmovdqa64 128(%rdi), %zmm5
+; AVX512F-NEXT:    vmovdqa64 64(%rdi), %zmm9
+; AVX512F-NEXT:    vmovdqa64 (%rdi), %zmm10
+; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [0,7,14,0,0,7,14,0]
 ; AVX512F-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vpermi2q %zmm6, %zmm7, %zmm8
+; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm6 = [9,0,7,0,9,0,7,0]
+; AVX512F-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm3
+; AVX512F-NEXT:    vpermt2q %zmm10, %zmm6, %zmm3
+; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [14,0,0,7,14,0,0,7]
+; AVX512F-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vpermi2q %zmm4, %zmm5, %zmm7
 ; AVX512F-NEXT:    vmovdqa 16(%rdi), %xmm11
 ; AVX512F-NEXT:    vpblendd {{.*#+}} xmm11 = xmm11[0,1],mem[2,3]
-; AVX512F-NEXT:    vinserti32x4 $0, %xmm11, %zmm8, %zmm8
-; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm11 = [7,0,9,0,7,0,9,0]
-; AVX512F-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vpermi2q %zmm7, %zmm6, %zmm11
-; AVX512F-NEXT:    vmovdqa 64(%rdi), %ymm12
-; AVX512F-NEXT:    vpalignr {{.*#+}} ymm12 = mem[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23]
-; AVX512F-NEXT:    vextracti128 $1, %ymm12, %xmm15
-; AVX512F-NEXT:    vinserti128 $1, 192(%rdi), %ymm0, %ymm13
-; AVX512F-NEXT:    vmovdqa 128(%rdi), %ymm12
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm13 = ymm12[0,1,2,3,4,5],ymm13[6,7]
+; AVX512F-NEXT:    vinserti32x4 $0, %xmm11, %zmm7, %zmm7
+; AVX512F-NEXT:    vmovdqa 64(%rdi), %ymm11
+; AVX512F-NEXT:    vpalignr {{.*#+}} ymm11 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23]
+; AVX512F-NEXT:    vextracti128 $1, %ymm11, %xmm11
+; AVX512F-NEXT:    vinserti128 $1, 192(%rdi), %ymm0, %ymm12
+; AVX512F-NEXT:    vmovdqa 128(%rdi), %ymm13
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7]
 ; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm14 = [4,11]
-; AVX512F-NEXT:    vpermi2q %zmm2, %zmm3, %zmm14
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7]
+; AVX512F-NEXT:    vpermi2q %zmm9, %zmm10, %zmm14
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm15 = ymm14[0,1,2,3],ymm12[4,5,6,7]
+; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm14 = [5,12]
+; AVX512F-NEXT:    vpermi2q %zmm9, %zmm10, %zmm14
+; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm12 = [6,13]
+; AVX512F-NEXT:    vpermi2q %zmm9, %zmm10, %zmm12
+; AVX512F-NEXT:    vpermt2q %zmm9, %zmm8, %zmm10
+; AVX512F-NEXT:    vpermi2q %zmm1, %zmm2, %zmm8
+; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm9 = [4,5,6,13,4,5,6,13]
+; AVX512F-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vpermi2q %zmm0, %zmm8, %zmm9
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm15, %zmm9, %zmm8
+; AVX512F-NEXT:    vmovdqa 192(%rdi), %ymm9
+; AVX512F-NEXT:    vpalignr {{.*#+}} ymm9 = ymm13[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23]
+; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %rdi
+; AVX512F-NEXT:    movb $24, %r10b
+; AVX512F-NEXT:    kmovw %r10d, %k2
+; AVX512F-NEXT:    vshufi64x2 {{.*#+}} zmm10 {%k2} = zmm5[4,5,4,5],zmm4[4,5,4,5]
+; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm13 = [10,3,10,3,10,3,10,3]
+; AVX512F-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vpermi2q %zmm1, %zmm2, %zmm13
+; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm15 = [0,5,6,9,0,5,6,9]
+; AVX512F-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vpermi2q %zmm0, %zmm13, %zmm15
+; AVX512F-NEXT:    movb $-32, %r10b
+; AVX512F-NEXT:    kmovw %r10d, %k1
+; AVX512F-NEXT:    vmovdqa64 %zmm15, %zmm10 {%k1}
+; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm13 = [13,6,13,6,13,6,13,6]
+; AVX512F-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm15 = [7,0,9,0,7,0,9,0]
+; AVX512F-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vpermi2q %zmm5, %zmm4, %zmm15
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm9 = ymm14[0,1,2,3],ymm9[4,5,6,7]
 ; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm14 = [4,11,4,11]
 ; AVX512F-NEXT:    # ymm14 = mem[0,1,0,1]
-; AVX512F-NEXT:    vpermi2q %zmm6, %zmm7, %zmm14
-; AVX512F-NEXT:    vpermt2q %zmm6, %zmm9, %zmm7
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm6 = <9,0,7,u>
-; AVX512F-NEXT:    vpermi2q %zmm3, %zmm2, %zmm6
-; AVX512F-NEXT:    vmovdqa64 %zmm7, %zmm6 {%k2}
-; AVX512F-NEXT:    vmovdqa64 %zmm10, %zmm6 {%k1}
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm7 = [12,5,12,5,12,5,12,5]
-; AVX512F-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vpermi2q %zmm4, %zmm5, %zmm7
-; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [0,5,6,11,0,5,6,11]
-; AVX512F-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vpermi2q %zmm1, %zmm7, %zmm10
-; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX512F-NEXT:    vmovdqa64 %zmm10, %zmm8 {%k1}
-; AVX512F-NEXT:    vpermi2q %zmm4, %zmm5, %zmm9
-; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [0,5,6,12,0,5,6,12]
-; AVX512F-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vpermi2q %zmm1, %zmm9, %zmm7
-; AVX512F-NEXT:    vinserti32x4 $0, %xmm15, %zmm11, %zmm9
-; AVX512F-NEXT:    vmovdqa64 %zmm7, %zmm9 {%k1}
-; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [0,7,14,0,0,7,14,0]
-; AVX512F-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vpermi2q %zmm4, %zmm5, %zmm7
-; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [4,5,6,13,4,5,6,13]
-; AVX512F-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vpermi2q %zmm1, %zmm7, %zmm10
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm13, %zmm10, %zmm7
-; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [9,0,7,0,9,0,7,0]
-; AVX512F-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vpermi2q %zmm5, %zmm4, %zmm10
-; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm11 = [4,5,6,14,4,5,6,14]
-; AVX512F-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vpermi2q %zmm1, %zmm10, %zmm11
-; AVX512F-NEXT:    vmovdqa 192(%rdi), %ymm10
-; AVX512F-NEXT:    vpalignr {{.*#+}} ymm10 = ymm12[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23]
-; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm12 = [5,12]
-; AVX512F-NEXT:    vpermi2q %zmm2, %zmm3, %zmm12
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm10, %zmm11, %zmm10
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm11 = [2,9,2,9,2,9,2,9]
-; AVX512F-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vpermi2q %zmm4, %zmm5, %zmm11
-; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,8,15,4,5,8,15]
+; AVX512F-NEXT:    vpermi2q %zmm4, %zmm5, %zmm14
+; AVX512F-NEXT:    vpermt2q %zmm4, %zmm13, %zmm5
+; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm3 {%k2}
+; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm4 = [11,4,11,4,11,4,11,4]
+; AVX512F-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vpermi2q %zmm1, %zmm2, %zmm4
+; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm5 = [0,5,6,10,0,5,6,10]
+; AVX512F-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vpermi2q %zmm0, %zmm4, %zmm5
+; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm3 {%k1}
+; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm4 = [12,5,12,5,12,5,12,5]
+; AVX512F-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vpermi2q %zmm1, %zmm2, %zmm4
+; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm5 = [0,5,6,11,0,5,6,11]
+; AVX512F-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vpermi2q %zmm0, %zmm4, %zmm5
+; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm7 {%k1}
+; AVX512F-NEXT:    vpermi2q %zmm1, %zmm2, %zmm13
+; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,12,0,5,6,12]
 ; AVX512F-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vpermi2q %zmm1, %zmm11, %zmm4
-; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm1 = [6,13]
-; AVX512F-NEXT:    vpermi2q %zmm2, %zmm3, %zmm1
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm14[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
-; AVX512F-NEXT:    vmovdqa64 %zmm0, (%rsi)
-; AVX512F-NEXT:    vmovdqa64 %zmm6, (%rdx)
-; AVX512F-NEXT:    vmovdqa64 %zmm8, (%rcx)
-; AVX512F-NEXT:    vmovdqa64 %zmm9, (%r8)
-; AVX512F-NEXT:    vmovdqa64 %zmm7, (%r9)
-; AVX512F-NEXT:    vmovdqa64 %zmm10, (%r10)
-; AVX512F-NEXT:    vmovdqa64 %zmm1, (%rax)
+; AVX512F-NEXT:    vpermi2q %zmm0, %zmm13, %zmm4
+; AVX512F-NEXT:    vinserti32x4 $0, %xmm11, %zmm15, %zmm5
+; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm5 {%k1}
+; AVX512F-NEXT:    vpermi2q %zmm2, %zmm1, %zmm6
+; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,14,4,5,6,14]
+; AVX512F-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vpermi2q %zmm0, %zmm6, %zmm4
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm9, %zmm4, %zmm4
+; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm6 = [2,9,2,9,2,9,2,9]
+; AVX512F-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vpermi2q %zmm1, %zmm2, %zmm6
+; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [4,5,8,15,4,5,8,15]
+; AVX512F-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vpermi2q %zmm0, %zmm6, %zmm1
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm14[4,5,6,7]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512F-NEXT:    vmovdqa64 %zmm10, (%rsi)
+; AVX512F-NEXT:    vmovdqa64 %zmm3, (%rdx)
+; AVX512F-NEXT:    vmovdqa64 %zmm7, (%rcx)
+; AVX512F-NEXT:    vmovdqa64 %zmm5, (%r8)
+; AVX512F-NEXT:    vmovdqa64 %zmm8, (%r9)
+; AVX512F-NEXT:    vmovdqa64 %zmm4, (%rdi)
+; AVX512F-NEXT:    vmovdqa64 %zmm0, (%rax)
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512BW-LABEL: load_i64_stride7_vf8:
 ; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vmovdqa64 384(%rdi), %zmm0
+; AVX512BW-NEXT:    vmovdqa64 320(%rdi), %zmm1
+; AVX512BW-NEXT:    vmovdqa64 256(%rdi), %zmm2
+; AVX512BW-NEXT:    vmovdqa64 192(%rdi), %zmm4
+; AVX512BW-NEXT:    vmovdqa64 128(%rdi), %zmm5
+; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %zmm9
+; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm10
+; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [0,7,14,0,0,7,14,0]
+; AVX512BW-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm6 = [9,0,7,0,9,0,7,0]
+; AVX512BW-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm3
+; AVX512BW-NEXT:    vpermt2q %zmm10, %zmm6, %zmm3
+; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [14,0,0,7,14,0,0,7]
+; AVX512BW-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vpermi2q %zmm4, %zmm5, %zmm7
+; AVX512BW-NEXT:    vmovdqa 16(%rdi), %xmm11
+; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm11 = xmm11[0,1],mem[2,3]
+; AVX512BW-NEXT:    vinserti32x4 $0, %xmm11, %zmm7, %zmm7
+; AVX512BW-NEXT:    vinserti128 $1, 192(%rdi), %ymm0, %ymm12
+; AVX512BW-NEXT:    vmovdqa 128(%rdi), %ymm11
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm12 = ymm11[0,1,2,3,4,5],ymm12[6,7]
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm13 = [4,11]
+; AVX512BW-NEXT:    vpermi2q %zmm9, %zmm10, %zmm13
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm14 = ymm13[0,1,2,3],ymm12[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm13 = [5,12]
+; AVX512BW-NEXT:    vpermi2q %zmm9, %zmm10, %zmm13
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm12 = [6,13]
+; AVX512BW-NEXT:    vpermi2q %zmm9, %zmm10, %zmm12
+; AVX512BW-NEXT:    vpermt2q %zmm9, %zmm8, %zmm10
+; AVX512BW-NEXT:    vpermi2q %zmm1, %zmm2, %zmm8
+; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm9 = [4,5,6,13,4,5,6,13]
+; AVX512BW-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vpermi2q %zmm0, %zmm8, %zmm9
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm14, %zmm9, %zmm8
 ; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX512BW-NEXT:    vmovdqa64 384(%rdi), %zmm1
-; AVX512BW-NEXT:    vmovdqa64 320(%rdi), %zmm4
-; AVX512BW-NEXT:    vmovdqa64 256(%rdi), %zmm5
-; AVX512BW-NEXT:    vmovdqa64 192(%rdi), %zmm6
-; AVX512BW-NEXT:    vmovdqa64 128(%rdi), %zmm7
-; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %zmm2
-; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm3
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm0 = <0,7,14,u>
-; AVX512BW-NEXT:    vpermi2q %zmm2, %zmm3, %zmm0
 ; AVX512BW-NEXT:    movb $24, %r11b
 ; AVX512BW-NEXT:    kmovd %r11d, %k2
-; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k2} = zmm7[4,5,4,5],zmm6[4,5,4,5]
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm8 = [10,3,10,3,10,3,10,3]
-; AVX512BW-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vpermi2q %zmm4, %zmm5, %zmm8
-; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm9 = [0,5,6,9,0,5,6,9]
-; AVX512BW-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vpermi2q %zmm1, %zmm8, %zmm9
+; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm10 {%k2} = zmm5[4,5,4,5],zmm4[4,5,4,5]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm9 = [10,3,10,3,10,3,10,3]
+; AVX512BW-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vpermi2q %zmm1, %zmm2, %zmm9
+; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm14 = [0,5,6,9,0,5,6,9]
+; AVX512BW-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vpermi2q %zmm0, %zmm9, %zmm14
 ; AVX512BW-NEXT:    movb $-32, %r11b
 ; AVX512BW-NEXT:    kmovd %r11d, %k1
-; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm0 {%k1}
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm8 = [11,4,11,4,11,4,11,4]
-; AVX512BW-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vpermi2q %zmm4, %zmm5, %zmm8
-; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [0,5,6,10,0,5,6,10]
-; AVX512BW-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vpermi2q %zmm1, %zmm8, %zmm10
+; AVX512BW-NEXT:    vmovdqa64 %zmm14, %zmm10 {%k1}
 ; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm9 = [13,6,13,6,13,6,13,6]
 ; AVX512BW-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [14,0,0,7,14,0,0,7]
-; AVX512BW-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vpermi2q %zmm6, %zmm7, %zmm8
-; AVX512BW-NEXT:    vmovdqa 16(%rdi), %xmm11
-; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm11 = xmm11[0,1],mem[2,3]
-; AVX512BW-NEXT:    vinserti32x4 $0, %xmm11, %zmm8, %zmm8
-; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm11 = [7,0,9,0,7,0,9,0]
-; AVX512BW-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vpermi2q %zmm7, %zmm6, %zmm11
-; AVX512BW-NEXT:    vinserti128 $1, 192(%rdi), %ymm0, %ymm13
-; AVX512BW-NEXT:    vmovdqa 128(%rdi), %ymm12
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm13 = ymm12[0,1,2,3,4,5],ymm13[6,7]
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm14 = [4,11]
-; AVX512BW-NEXT:    vpermi2q %zmm2, %zmm3, %zmm14
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7]
-; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm14 = [4,11,4,11]
-; AVX512BW-NEXT:    # ymm14 = mem[0,1,0,1]
-; AVX512BW-NEXT:    vpermi2q %zmm6, %zmm7, %zmm14
-; AVX512BW-NEXT:    vpermt2q %zmm6, %zmm9, %zmm7
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm6 = <9,0,7,u>
-; AVX512BW-NEXT:    vpermi2q %zmm3, %zmm2, %zmm6
-; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm6 {%k2}
-; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm6 {%k1}
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm7 = [12,5,12,5,12,5,12,5]
-; AVX512BW-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vpermi2q %zmm4, %zmm5, %zmm7
-; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [0,5,6,11,0,5,6,11]
-; AVX512BW-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vpermi2q %zmm1, %zmm7, %zmm10
-; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm8 {%k1}
-; AVX512BW-NEXT:    vpermi2q %zmm4, %zmm5, %zmm9
-; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [0,5,6,12,0,5,6,12]
-; AVX512BW-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vpermi2q %zmm1, %zmm9, %zmm7
-; AVX512BW-NEXT:    vmovdqa 64(%rdi), %ymm9
-; AVX512BW-NEXT:    vpalignr {{.*#+}} ymm9 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23]
-; AVX512BW-NEXT:    vextracti128 $1, %ymm9, %xmm9
-; AVX512BW-NEXT:    vinserti32x4 $0, %xmm9, %zmm11, %zmm9
-; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm9 {%k1}
-; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [0,7,14,0,0,7,14,0]
-; AVX512BW-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vpermi2q %zmm4, %zmm5, %zmm7
-; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [4,5,6,13,4,5,6,13]
-; AVX512BW-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vpermi2q %zmm1, %zmm7, %zmm10
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm13, %zmm10, %zmm7
-; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [9,0,7,0,9,0,7,0]
-; AVX512BW-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vpermi2q %zmm5, %zmm4, %zmm10
-; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm11 = [4,5,6,14,4,5,6,14]
-; AVX512BW-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vpermi2q %zmm1, %zmm10, %zmm11
-; AVX512BW-NEXT:    vmovdqa 192(%rdi), %ymm10
-; AVX512BW-NEXT:    vpalignr {{.*#+}} ymm10 = ymm12[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23]
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm12 = [5,12]
-; AVX512BW-NEXT:    vpermi2q %zmm2, %zmm3, %zmm12
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm10, %zmm11, %zmm10
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm11 = [2,9,2,9,2,9,2,9]
-; AVX512BW-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vpermi2q %zmm4, %zmm5, %zmm11
-; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,8,15,4,5,8,15]
+; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm14 = [7,0,9,0,7,0,9,0]
+; AVX512BW-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vpermi2q %zmm5, %zmm4, %zmm14
+; AVX512BW-NEXT:    vmovdqa 192(%rdi), %ymm15
+; AVX512BW-NEXT:    vpalignr {{.*#+}} ymm11 = ymm11[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23]
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3],ymm11[4,5,6,7]
+; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm13 = [4,11,4,11]
+; AVX512BW-NEXT:    # ymm13 = mem[0,1,0,1]
+; AVX512BW-NEXT:    vpermi2q %zmm4, %zmm5, %zmm13
+; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm9, %zmm5
+; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm3 {%k2}
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm4 = [11,4,11,4,11,4,11,4]
+; AVX512BW-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vpermi2q %zmm1, %zmm2, %zmm4
+; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm5 = [0,5,6,10,0,5,6,10]
+; AVX512BW-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vpermi2q %zmm0, %zmm4, %zmm5
+; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm3 {%k1}
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm4 = [12,5,12,5,12,5,12,5]
+; AVX512BW-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vpermi2q %zmm1, %zmm2, %zmm4
+; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm5 = [0,5,6,11,0,5,6,11]
+; AVX512BW-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vpermi2q %zmm0, %zmm4, %zmm5
+; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm7 {%k1}
+; AVX512BW-NEXT:    vpermi2q %zmm1, %zmm2, %zmm9
+; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,12,0,5,6,12]
 ; AVX512BW-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vpermi2q %zmm1, %zmm11, %zmm4
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm1 = [6,13]
-; AVX512BW-NEXT:    vpermi2q %zmm2, %zmm3, %zmm1
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm14[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rsi)
-; AVX512BW-NEXT:    vmovdqa64 %zmm6, (%rdx)
-; AVX512BW-NEXT:    vmovdqa64 %zmm8, (%rcx)
-; AVX512BW-NEXT:    vmovdqa64 %zmm9, (%r8)
-; AVX512BW-NEXT:    vmovdqa64 %zmm7, (%r9)
-; AVX512BW-NEXT:    vmovdqa64 %zmm10, (%r10)
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, (%rax)
+; AVX512BW-NEXT:    vpermi2q %zmm0, %zmm9, %zmm4
+; AVX512BW-NEXT:    vmovdqa 64(%rdi), %ymm5
+; AVX512BW-NEXT:    vpalignr {{.*#+}} ymm5 = mem[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23]
+; AVX512BW-NEXT:    vextracti128 $1, %ymm5, %xmm5
+; AVX512BW-NEXT:    vinserti32x4 $0, %xmm5, %zmm14, %zmm5
+; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm5 {%k1}
+; AVX512BW-NEXT:    vpermi2q %zmm2, %zmm1, %zmm6
+; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,14,4,5,6,14]
+; AVX512BW-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vpermi2q %zmm0, %zmm6, %zmm4
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm11, %zmm4, %zmm4
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm6 = [2,9,2,9,2,9,2,9]
+; AVX512BW-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vpermi2q %zmm1, %zmm2, %zmm6
+; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [4,5,8,15,4,5,8,15]
+; AVX512BW-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vpermi2q %zmm0, %zmm6, %zmm1
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm13[4,5,6,7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512BW-NEXT:    vmovdqa64 %zmm10, (%rsi)
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, (%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm7, (%rcx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm5, (%r8)
+; AVX512BW-NEXT:    vmovdqa64 %zmm8, (%r9)
+; AVX512BW-NEXT:    vmovdqa64 %zmm4, (%r10)
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rax)
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
   %wide.vec = load <56 x i64>, ptr %in.vec, align 64
@@ -1724,183 +1722,185 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX512F-NEXT:    vmovdqa64 832(%rdi), %zmm28
-; AVX512F-NEXT:    vmovdqa64 768(%rdi), %zmm30
+; AVX512F-NEXT:    vmovdqa64 832(%rdi), %zmm30
+; AVX512F-NEXT:    vmovdqa64 768(%rdi), %zmm6
 ; AVX512F-NEXT:    vmovdqa64 704(%rdi), %zmm0
-; AVX512F-NEXT:    vmovdqa64 640(%rdi), %zmm25
-; AVX512F-NEXT:    vmovdqa64 576(%rdi), %zmm2
-; AVX512F-NEXT:    vmovdqa64 512(%rdi), %zmm24
-; AVX512F-NEXT:    vmovdqa64 448(%rdi), %zmm6
-; AVX512F-NEXT:    vmovdqa64 384(%rdi), %zmm14
+; AVX512F-NEXT:    vmovdqa64 640(%rdi), %zmm26
+; AVX512F-NEXT:    vmovdqa64 576(%rdi), %zmm3
+; AVX512F-NEXT:    vmovdqa64 512(%rdi), %zmm27
+; AVX512F-NEXT:    vmovdqa64 448(%rdi), %zmm2
+; AVX512F-NEXT:    vmovdqa64 384(%rdi), %zmm13
 ; AVX512F-NEXT:    vmovdqa64 320(%rdi), %zmm15
-; AVX512F-NEXT:    vmovdqa64 256(%rdi), %zmm9
-; AVX512F-NEXT:    vmovdqa64 192(%rdi), %zmm27
-; AVX512F-NEXT:    vmovdqa64 128(%rdi), %zmm10
-; AVX512F-NEXT:    vmovdqa64 64(%rdi), %zmm26
+; AVX512F-NEXT:    vmovdqa64 256(%rdi), %zmm8
+; AVX512F-NEXT:    vmovdqa64 192(%rdi), %zmm29
+; AVX512F-NEXT:    vmovdqa64 128(%rdi), %zmm9
+; AVX512F-NEXT:    vmovdqa64 64(%rdi), %zmm28
 ; AVX512F-NEXT:    vmovdqa64 (%rdi), %zmm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm7 = <0,7,14,u>
-; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm29
-; AVX512F-NEXT:    vpermt2q %zmm26, %zmm7, %zmm29
+; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm18 = [0,7,14,0,0,7,14,0]
+; AVX512F-NEXT:    # zmm18 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm31
+; AVX512F-NEXT:    vpermt2q %zmm28, %zmm18, %zmm31
 ; AVX512F-NEXT:    movb $24, %r11b
 ; AVX512F-NEXT:    kmovw %r11d, %k2
-; AVX512F-NEXT:    vshufi64x2 {{.*#+}} zmm29 {%k2} = zmm10[4,5,4,5],zmm27[4,5,4,5]
+; AVX512F-NEXT:    vshufi64x2 {{.*#+}} zmm31 {%k2} = zmm9[4,5,4,5],zmm29[4,5,4,5]
 ; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm16 = [10,3,10,3,10,3,10,3]
 ; AVX512F-NEXT:    # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm17
-; AVX512F-NEXT:    vpermt2q %zmm15, %zmm16, %zmm17
-; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm18 = [0,5,6,9,0,5,6,9]
-; AVX512F-NEXT:    # zmm18 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vpermt2q %zmm14, %zmm18, %zmm17
+; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm14
+; AVX512F-NEXT:    vpermt2q %zmm15, %zmm16, %zmm14
+; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm17 = [0,5,6,9,0,5,6,9]
+; AVX512F-NEXT:    # zmm17 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vpermt2q %zmm13, %zmm17, %zmm14
 ; AVX512F-NEXT:    movb $-32, %r11b
 ; AVX512F-NEXT:    kmovw %r11d, %k1
-; AVX512F-NEXT:    vmovdqa64 %zmm17, %zmm29 {%k1}
-; AVX512F-NEXT:    vpermi2q %zmm24, %zmm6, %zmm7
-; AVX512F-NEXT:    vshufi64x2 {{.*#+}} zmm7 {%k2} = zmm2[4,5,4,5],zmm25[4,5,4,5]
-; AVX512F-NEXT:    vpermi2q %zmm30, %zmm0, %zmm16
-; AVX512F-NEXT:    vpermt2q %zmm28, %zmm18, %zmm16
-; AVX512F-NEXT:    vmovdqa64 %zmm16, %zmm7 {%k1}
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm18 = [11,4,11,4,11,4,11,4]
-; AVX512F-NEXT:    # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm19
-; AVX512F-NEXT:    vpermt2q %zmm15, %zmm18, %zmm19
-; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm20 = [0,5,6,10,0,5,6,10]
+; AVX512F-NEXT:    vmovdqa64 %zmm14, %zmm31 {%k1}
+; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm14
+; AVX512F-NEXT:    vpermt2q %zmm27, %zmm18, %zmm14
+; AVX512F-NEXT:    vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm3[4,5,4,5],zmm26[4,5,4,5]
+; AVX512F-NEXT:    vpermi2q %zmm6, %zmm0, %zmm16
+; AVX512F-NEXT:    vpermt2q %zmm30, %zmm17, %zmm16
+; AVX512F-NEXT:    vmovdqa64 %zmm16, %zmm14 {%k1}
+; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm24 = [13,6,13,6,13,6,13,6]
+; AVX512F-NEXT:    # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm17
+; AVX512F-NEXT:    vpermt2q %zmm29, %zmm24, %zmm17
+; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm20 = [9,0,7,0,9,0,7,0]
 ; AVX512F-NEXT:    # zmm20 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vpermt2q %zmm14, %zmm20, %zmm19
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm22 = [13,6,13,6,13,6,13,6]
-; AVX512F-NEXT:    # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm10, %zmm21
-; AVX512F-NEXT:    vpermt2q %zmm27, %zmm22, %zmm21
-; AVX512F-NEXT:    vmovdqa64 {{.*#+}} ymm16 = <9,0,7,u>
-; AVX512F-NEXT:    vmovdqa64 %zmm26, %zmm17
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm16, %zmm17
-; AVX512F-NEXT:    vmovdqa64 %zmm21, %zmm17 {%k2}
-; AVX512F-NEXT:    vmovdqa64 %zmm19, %zmm17 {%k1}
-; AVX512F-NEXT:    vpermi2q %zmm30, %zmm0, %zmm18
-; AVX512F-NEXT:    vpermt2q %zmm28, %zmm20, %zmm18
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm19
-; AVX512F-NEXT:    vpermt2q %zmm25, %zmm22, %zmm19
-; AVX512F-NEXT:    vpermi2q %zmm6, %zmm24, %zmm16
-; AVX512F-NEXT:    vmovdqa64 %zmm19, %zmm16 {%k2}
-; AVX512F-NEXT:    vmovdqa64 %zmm18, %zmm16 {%k1}
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm20 = [12,5,12,5,12,5,12,5]
-; AVX512F-NEXT:    # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm19
-; AVX512F-NEXT:    vpermt2q %zmm15, %zmm20, %zmm19
-; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm21 = [0,5,6,11,0,5,6,11]
+; AVX512F-NEXT:    vmovdqa64 %zmm28, %zmm16
+; AVX512F-NEXT:    vpermt2q %zmm1, %zmm20, %zmm16
+; AVX512F-NEXT:    vmovdqa64 %zmm17, %zmm16 {%k2}
+; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm19 = [11,4,11,4,11,4,11,4]
+; AVX512F-NEXT:    # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm17
+; AVX512F-NEXT:    vpermt2q %zmm15, %zmm19, %zmm17
+; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm21 = [0,5,6,10,0,5,6,10]
 ; AVX512F-NEXT:    # zmm21 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vpermt2q %zmm14, %zmm21, %zmm19
-; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm23 = [14,0,0,7,14,0,0,7]
+; AVX512F-NEXT:    vpermt2q %zmm13, %zmm21, %zmm17
+; AVX512F-NEXT:    vmovdqa64 %zmm17, %zmm16 {%k1}
+; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm22
+; AVX512F-NEXT:    vpermt2q %zmm26, %zmm24, %zmm22
+; AVX512F-NEXT:    vmovdqa64 %zmm27, %zmm17
+; AVX512F-NEXT:    vpermt2q %zmm2, %zmm20, %zmm17
+; AVX512F-NEXT:    vmovdqa64 %zmm22, %zmm17 {%k2}
+; AVX512F-NEXT:    vpermi2q %zmm6, %zmm0, %zmm19
+; AVX512F-NEXT:    vpermt2q %zmm30, %zmm21, %zmm19
+; AVX512F-NEXT:    vmovdqa64 %zmm19, %zmm17 {%k1}
+; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm22 = [12,5,12,5,12,5,12,5]
+; AVX512F-NEXT:    # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm21
+; AVX512F-NEXT:    vpermt2q %zmm15, %zmm22, %zmm21
+; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm23 = [0,5,6,11,0,5,6,11]
 ; AVX512F-NEXT:    # zmm23 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm10, %zmm18
-; AVX512F-NEXT:    vpermt2q %zmm27, %zmm23, %zmm18
-; AVX512F-NEXT:    vmovdqa 16(%rdi), %xmm4
-; AVX512F-NEXT:    vpblendd {{.*#+}} xmm4 = xmm4[0,1],mem[2,3]
-; AVX512F-NEXT:    vinserti32x4 $0, %xmm4, %zmm18, %zmm18
-; AVX512F-NEXT:    vmovdqa64 %zmm19, %zmm18 {%k1}
-; AVX512F-NEXT:    vpermi2q %zmm30, %zmm0, %zmm20
-; AVX512F-NEXT:    vpermt2q %zmm28, %zmm21, %zmm20
-; AVX512F-NEXT:    vpermi2q %zmm25, %zmm2, %zmm23
-; AVX512F-NEXT:    vmovdqa 464(%rdi), %xmm4
-; AVX512F-NEXT:    vpblendd {{.*#+}} xmm4 = xmm4[0,1],mem[2,3]
-; AVX512F-NEXT:    vinserti32x4 $0, %xmm4, %zmm23, %zmm19
-; AVX512F-NEXT:    vmovdqa64 %zmm20, %zmm19 {%k1}
-; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm4
-; AVX512F-NEXT:    vpermt2q %zmm15, %zmm22, %zmm4
-; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm21 = [0,5,6,12,0,5,6,12]
-; AVX512F-NEXT:    # zmm21 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vpermt2q %zmm14, %zmm21, %zmm4
-; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm23 = [7,0,9,0,7,0,9,0]
+; AVX512F-NEXT:    vpermt2q %zmm13, %zmm23, %zmm21
+; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm25 = [14,0,0,7,14,0,0,7]
+; AVX512F-NEXT:    # zmm25 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm19
+; AVX512F-NEXT:    vpermt2q %zmm29, %zmm25, %zmm19
+; AVX512F-NEXT:    vmovdqa 16(%rdi), %xmm5
+; AVX512F-NEXT:    vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3]
+; AVX512F-NEXT:    vinserti32x4 $0, %xmm5, %zmm19, %zmm19
+; AVX512F-NEXT:    vmovdqa64 %zmm21, %zmm19 {%k1}
+; AVX512F-NEXT:    vpermi2q %zmm6, %zmm0, %zmm22
+; AVX512F-NEXT:    vpermt2q %zmm30, %zmm23, %zmm22
+; AVX512F-NEXT:    vpermi2q %zmm26, %zmm3, %zmm25
+; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa 464(%rdi), %xmm5
+; AVX512F-NEXT:    vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3]
+; AVX512F-NEXT:    vinserti32x4 $0, %xmm5, %zmm25, %zmm21
+; AVX512F-NEXT:    vmovdqa64 %zmm22, %zmm21 {%k1}
+; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm5
+; AVX512F-NEXT:    vpermt2q %zmm15, %zmm24, %zmm5
+; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm23 = [0,5,6,12,0,5,6,12]
 ; AVX512F-NEXT:    # zmm23 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm27, %zmm20
-; AVX512F-NEXT:    vpermt2q %zmm10, %zmm23, %zmm20
-; AVX512F-NEXT:    vmovdqa 64(%rdi), %ymm5
-; AVX512F-NEXT:    vpalignr {{.*#+}} ymm5 = mem[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23]
-; AVX512F-NEXT:    vextracti128 $1, %ymm5, %xmm5
-; AVX512F-NEXT:    vinserti32x4 $0, %xmm5, %zmm20, %zmm20
-; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm20 {%k1}
-; AVX512F-NEXT:    vpermi2q %zmm30, %zmm0, %zmm22
-; AVX512F-NEXT:    vpermt2q %zmm28, %zmm21, %zmm22
-; AVX512F-NEXT:    vpermi2q %zmm2, %zmm25, %zmm23
+; AVX512F-NEXT:    vpermt2q %zmm13, %zmm23, %zmm5
+; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm25 = [7,0,9,0,7,0,9,0]
+; AVX512F-NEXT:    # zmm25 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vmovdqa64 %zmm29, %zmm22
+; AVX512F-NEXT:    vpermt2q %zmm9, %zmm25, %zmm22
+; AVX512F-NEXT:    vmovdqa 64(%rdi), %ymm4
+; AVX512F-NEXT:    vpalignr {{.*#+}} ymm4 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23]
+; AVX512F-NEXT:    vextracti128 $1, %ymm4, %xmm4
+; AVX512F-NEXT:    vinserti32x4 $0, %xmm4, %zmm22, %zmm22
+; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm22 {%k1}
+; AVX512F-NEXT:    vpermi2q %zmm6, %zmm0, %zmm24
+; AVX512F-NEXT:    vpermt2q %zmm30, %zmm23, %zmm24
+; AVX512F-NEXT:    vpermi2q %zmm3, %zmm26, %zmm25
 ; AVX512F-NEXT:    vmovdqa 512(%rdi), %ymm4
 ; AVX512F-NEXT:    vpalignr {{.*#+}} ymm4 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23]
 ; AVX512F-NEXT:    vextracti128 $1, %ymm4, %xmm4
-; AVX512F-NEXT:    vinserti32x4 $0, %xmm4, %zmm23, %zmm21
-; AVX512F-NEXT:    vmovdqa64 %zmm22, %zmm21 {%k1}
-; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm4 = [0,7,14,0,0,7,14,0]
-; AVX512F-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm5
-; AVX512F-NEXT:    vpermt2q %zmm15, %zmm4, %zmm5
-; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm23 = [4,5,6,13,4,5,6,13]
-; AVX512F-NEXT:    # zmm23 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vpermt2q %zmm14, %zmm23, %zmm5
-; AVX512F-NEXT:    vinserti128 $1, 192(%rdi), %ymm0, %ymm12
-; AVX512F-NEXT:    vmovdqa 128(%rdi), %ymm13
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7]
+; AVX512F-NEXT:    vinserti32x4 $0, %xmm4, %zmm25, %zmm23
+; AVX512F-NEXT:    vmovdqa64 %zmm24, %zmm23 {%k1}
+; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm4
+; AVX512F-NEXT:    vpermt2q %zmm15, %zmm18, %zmm4
+; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm5 = [4,5,6,13,4,5,6,13]
+; AVX512F-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vpermt2q %zmm13, %zmm5, %zmm4
+; AVX512F-NEXT:    vinserti128 $1, 192(%rdi), %ymm0, %ymm11
+; AVX512F-NEXT:    vmovdqa 128(%rdi), %ymm12
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7]
 ; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = [4,11]
-; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm11
-; AVX512F-NEXT:    vpermt2q %zmm26, %zmm3, %zmm11
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm11, %zmm5, %zmm22
-; AVX512F-NEXT:    vpermi2q %zmm30, %zmm0, %zmm4
-; AVX512F-NEXT:    vpermt2q %zmm28, %zmm23, %zmm4
-; AVX512F-NEXT:    vinserti128 $1, 640(%rdi), %ymm0, %ymm5
-; AVX512F-NEXT:    vmovdqa 576(%rdi), %ymm11
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3,4,5],ymm5[6,7]
-; AVX512F-NEXT:    vpermi2q %zmm24, %zmm6, %zmm3
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm3, %zmm4, %zmm23
-; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [9,0,7,0,9,0,7,0]
-; AVX512F-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm10
+; AVX512F-NEXT:    vpermt2q %zmm28, %zmm3, %zmm10
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm10, %zmm4, %zmm24
+; AVX512F-NEXT:    vpermi2q %zmm6, %zmm0, %zmm18
+; AVX512F-NEXT:    vpermt2q %zmm30, %zmm5, %zmm18
+; AVX512F-NEXT:    vinserti128 $1, 640(%rdi), %ymm0, %ymm4
+; AVX512F-NEXT:    vmovdqa 576(%rdi), %ymm5
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7]
+; AVX512F-NEXT:    vpermi2q %zmm27, %zmm2, %zmm3
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm3, %zmm18, %zmm3
 ; AVX512F-NEXT:    vmovdqa64 %zmm15, %zmm4
-; AVX512F-NEXT:    vpermt2q %zmm9, %zmm3, %zmm4
-; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm5 = [4,5,6,14,4,5,6,14]
-; AVX512F-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vpermt2q %zmm14, %zmm5, %zmm4
-; AVX512F-NEXT:    vmovdqa 192(%rdi), %ymm12
-; AVX512F-NEXT:    vpalignr {{.*#+}} ymm12 = ymm13[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23]
-; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm13 = [5,12]
-; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm8
-; AVX512F-NEXT:    vpermt2q %zmm26, %zmm13, %zmm8
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm12[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm8, %zmm4, %zmm31
-; AVX512F-NEXT:    vpermi2q %zmm0, %zmm30, %zmm3
-; AVX512F-NEXT:    vpermt2q %zmm28, %zmm5, %zmm3
-; AVX512F-NEXT:    vmovdqa 640(%rdi), %ymm5
-; AVX512F-NEXT:    vpalignr {{.*#+}} ymm5 = ymm11[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23]
-; AVX512F-NEXT:    vpermi2q %zmm24, %zmm6, %zmm13
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3],ymm5[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm5, %zmm3, %zmm3
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm5 = [2,9,2,9,2,9,2,9]
-; AVX512F-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vpermt2q %zmm15, %zmm5, %zmm9
-; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [4,5,8,15,4,5,8,15]
-; AVX512F-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vpermt2q %zmm14, %zmm8, %zmm9
+; AVX512F-NEXT:    vpermt2q %zmm8, %zmm20, %zmm4
+; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [4,5,6,14,4,5,6,14]
+; AVX512F-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vpermt2q %zmm13, %zmm10, %zmm4
+; AVX512F-NEXT:    vmovdqa 192(%rdi), %ymm11
+; AVX512F-NEXT:    vpalignr {{.*#+}} ymm11 = ymm12[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23]
+; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm12 = [5,12]
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm7
+; AVX512F-NEXT:    vpermt2q %zmm28, %zmm12, %zmm7
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm11[4,5,6,7]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm7, %zmm4, %zmm4
+; AVX512F-NEXT:    vpermi2q %zmm0, %zmm6, %zmm20
+; AVX512F-NEXT:    vpermt2q %zmm30, %zmm10, %zmm20
+; AVX512F-NEXT:    vmovdqa 640(%rdi), %ymm7
+; AVX512F-NEXT:    vpalignr {{.*#+}} ymm5 = ymm5[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23]
+; AVX512F-NEXT:    vpermi2q %zmm27, %zmm2, %zmm12
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm5 = ymm12[0,1,2,3],ymm5[4,5,6,7]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm5, %zmm20, %zmm5
+; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm7 = [2,9,2,9,2,9,2,9]
+; AVX512F-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vpermt2q %zmm15, %zmm7, %zmm8
+; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [4,5,8,15,4,5,8,15]
+; AVX512F-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vpermt2q %zmm13, %zmm10, %zmm8
 ; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm11 = [4,11,4,11]
 ; AVX512F-NEXT:    # ymm11 = mem[0,1,0,1]
-; AVX512F-NEXT:    vpermt2q %zmm27, %zmm11, %zmm10
+; AVX512F-NEXT:    vpermt2q %zmm29, %zmm11, %zmm9
 ; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm12 = [6,13]
-; AVX512F-NEXT:    vpermt2q %zmm26, %zmm12, %zmm1
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm10[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm9, %zmm1
-; AVX512F-NEXT:    vpermt2q %zmm30, %zmm5, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm28, %zmm8, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm25, %zmm11, %zmm2
-; AVX512F-NEXT:    vpermt2q %zmm24, %zmm12, %zmm6
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7]
+; AVX512F-NEXT:    vpermt2q %zmm28, %zmm12, %zmm1
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm9[4,5,6,7]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm8, %zmm1
+; AVX512F-NEXT:    vpermt2q %zmm6, %zmm7, %zmm0
+; AVX512F-NEXT:    vpermt2q %zmm30, %zmm10, %zmm0
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm26, %zmm11, %zmm6
+; AVX512F-NEXT:    vpermt2q %zmm27, %zmm12, %zmm2
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7]
 ; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm0, %zmm0
-; AVX512F-NEXT:    vmovdqa64 %zmm7, 64(%rsi)
-; AVX512F-NEXT:    vmovdqa64 %zmm29, (%rsi)
-; AVX512F-NEXT:    vmovdqa64 %zmm16, 64(%rdx)
-; AVX512F-NEXT:    vmovdqa64 %zmm17, (%rdx)
-; AVX512F-NEXT:    vmovdqa64 %zmm19, 64(%rcx)
-; AVX512F-NEXT:    vmovdqa64 %zmm18, (%rcx)
-; AVX512F-NEXT:    vmovdqa64 %zmm21, 64(%r8)
-; AVX512F-NEXT:    vmovdqa64 %zmm20, (%r8)
-; AVX512F-NEXT:    vmovdqa64 %zmm23, 64(%r9)
-; AVX512F-NEXT:    vmovdqa64 %zmm22, (%r9)
-; AVX512F-NEXT:    vmovdqa64 %zmm3, 64(%r10)
-; AVX512F-NEXT:    vmovdqa64 %zmm31, (%r10)
+; AVX512F-NEXT:    vmovdqa64 %zmm14, 64(%rsi)
+; AVX512F-NEXT:    vmovdqa64 %zmm31, (%rsi)
+; AVX512F-NEXT:    vmovdqa64 %zmm17, 64(%rdx)
+; AVX512F-NEXT:    vmovdqa64 %zmm16, (%rdx)
+; AVX512F-NEXT:    vmovdqa64 %zmm21, 64(%rcx)
+; AVX512F-NEXT:    vmovdqa64 %zmm19, (%rcx)
+; AVX512F-NEXT:    vmovdqa64 %zmm23, 64(%r8)
+; AVX512F-NEXT:    vmovdqa64 %zmm22, (%r8)
+; AVX512F-NEXT:    vmovdqa64 %zmm3, 64(%r9)
+; AVX512F-NEXT:    vmovdqa64 %zmm24, (%r9)
+; AVX512F-NEXT:    vmovdqa64 %zmm5, 64(%r10)
+; AVX512F-NEXT:    vmovdqa64 %zmm4, (%r10)
 ; AVX512F-NEXT:    vmovdqa64 %zmm0, 64(%rax)
 ; AVX512F-NEXT:    vmovdqa64 %zmm1, (%rax)
 ; AVX512F-NEXT:    vzeroupper
@@ -1910,183 +1910,185 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX512BW-NEXT:    vmovdqa64 832(%rdi), %zmm30
-; AVX512BW-NEXT:    vmovdqa64 768(%rdi), %zmm31
+; AVX512BW-NEXT:    vmovdqa64 832(%rdi), %zmm3
+; AVX512BW-NEXT:    vmovdqa64 768(%rdi), %zmm6
 ; AVX512BW-NEXT:    vmovdqa64 704(%rdi), %zmm0
 ; AVX512BW-NEXT:    vmovdqa64 640(%rdi), %zmm26
-; AVX512BW-NEXT:    vmovdqa64 576(%rdi), %zmm2
-; AVX512BW-NEXT:    vmovdqa64 512(%rdi), %zmm24
-; AVX512BW-NEXT:    vmovdqa64 448(%rdi), %zmm6
-; AVX512BW-NEXT:    vmovdqa64 384(%rdi), %zmm14
+; AVX512BW-NEXT:    vmovdqa64 576(%rdi), %zmm4
+; AVX512BW-NEXT:    vmovdqa64 512(%rdi), %zmm28
+; AVX512BW-NEXT:    vmovdqa64 448(%rdi), %zmm2
+; AVX512BW-NEXT:    vmovdqa64 384(%rdi), %zmm13
 ; AVX512BW-NEXT:    vmovdqa64 320(%rdi), %zmm15
-; AVX512BW-NEXT:    vmovdqa64 256(%rdi), %zmm9
-; AVX512BW-NEXT:    vmovdqa64 192(%rdi), %zmm28
-; AVX512BW-NEXT:    vmovdqa64 128(%rdi), %zmm10
-; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %zmm27
+; AVX512BW-NEXT:    vmovdqa64 256(%rdi), %zmm8
+; AVX512BW-NEXT:    vmovdqa64 192(%rdi), %zmm30
+; AVX512BW-NEXT:    vmovdqa64 128(%rdi), %zmm9
+; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %zmm29
 ; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm1
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm7 = <0,7,14,u>
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm29
-; AVX512BW-NEXT:    vpermt2q %zmm27, %zmm7, %zmm29
+; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm18 = [0,7,14,0,0,7,14,0]
+; AVX512BW-NEXT:    # zmm18 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm31
+; AVX512BW-NEXT:    vpermt2q %zmm29, %zmm18, %zmm31
 ; AVX512BW-NEXT:    movb $24, %r11b
 ; AVX512BW-NEXT:    kmovd %r11d, %k2
-; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm29 {%k2} = zmm10[4,5,4,5],zmm28[4,5,4,5]
+; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm31 {%k2} = zmm9[4,5,4,5],zmm30[4,5,4,5]
 ; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm16 = [10,3,10,3,10,3,10,3]
 ; AVX512BW-NEXT:    # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm17
-; AVX512BW-NEXT:    vpermt2q %zmm15, %zmm16, %zmm17
-; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm18 = [0,5,6,9,0,5,6,9]
-; AVX512BW-NEXT:    # zmm18 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vpermt2q %zmm14, %zmm18, %zmm17
+; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm14
+; AVX512BW-NEXT:    vpermt2q %zmm15, %zmm16, %zmm14
+; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm17 = [0,5,6,9,0,5,6,9]
+; AVX512BW-NEXT:    # zmm17 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vpermt2q %zmm13, %zmm17, %zmm14
 ; AVX512BW-NEXT:    movb $-32, %r11b
 ; AVX512BW-NEXT:    kmovd %r11d, %k1
-; AVX512BW-NEXT:    vmovdqa64 %zmm17, %zmm29 {%k1}
-; AVX512BW-NEXT:    vpermi2q %zmm24, %zmm6, %zmm7
-; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm7 {%k2} = zmm2[4,5,4,5],zmm26[4,5,4,5]
-; AVX512BW-NEXT:    vpermi2q %zmm31, %zmm0, %zmm16
-; AVX512BW-NEXT:    vpermt2q %zmm30, %zmm18, %zmm16
-; AVX512BW-NEXT:    vmovdqa64 %zmm16, %zmm7 {%k1}
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm18 = [11,4,11,4,11,4,11,4]
-; AVX512BW-NEXT:    # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm19
-; AVX512BW-NEXT:    vpermt2q %zmm15, %zmm18, %zmm19
-; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm20 = [0,5,6,10,0,5,6,10]
+; AVX512BW-NEXT:    vmovdqa64 %zmm14, %zmm31 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm14
+; AVX512BW-NEXT:    vpermt2q %zmm28, %zmm18, %zmm14
+; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm4[4,5,4,5],zmm26[4,5,4,5]
+; AVX512BW-NEXT:    vpermi2q %zmm6, %zmm0, %zmm16
+; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm17, %zmm16
+; AVX512BW-NEXT:    vmovdqa64 %zmm16, %zmm14 {%k1}
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm24 = [13,6,13,6,13,6,13,6]
+; AVX512BW-NEXT:    # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm17
+; AVX512BW-NEXT:    vpermt2q %zmm30, %zmm24, %zmm17
+; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm20 = [9,0,7,0,9,0,7,0]
 ; AVX512BW-NEXT:    # zmm20 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vpermt2q %zmm14, %zmm20, %zmm19
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm22 = [13,6,13,6,13,6,13,6]
-; AVX512BW-NEXT:    # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm21
-; AVX512BW-NEXT:    vpermt2q %zmm28, %zmm22, %zmm21
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} ymm16 = <9,0,7,u>
-; AVX512BW-NEXT:    vmovdqa64 %zmm27, %zmm17
-; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm16, %zmm17
-; AVX512BW-NEXT:    vmovdqa64 %zmm21, %zmm17 {%k2}
-; AVX512BW-NEXT:    vmovdqa64 %zmm19, %zmm17 {%k1}
-; AVX512BW-NEXT:    vpermi2q %zmm31, %zmm0, %zmm18
-; AVX512BW-NEXT:    vpermt2q %zmm30, %zmm20, %zmm18
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm19
-; AVX512BW-NEXT:    vpermt2q %zmm26, %zmm22, %zmm19
-; AVX512BW-NEXT:    vpermi2q %zmm6, %zmm24, %zmm16
-; AVX512BW-NEXT:    vmovdqa64 %zmm19, %zmm16 {%k2}
-; AVX512BW-NEXT:    vmovdqa64 %zmm18, %zmm16 {%k1}
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm20 = [12,5,12,5,12,5,12,5]
-; AVX512BW-NEXT:    # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm19
-; AVX512BW-NEXT:    vpermt2q %zmm15, %zmm20, %zmm19
-; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm21 = [0,5,6,11,0,5,6,11]
-; AVX512BW-NEXT:    # zmm21 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vpermt2q %zmm14, %zmm21, %zmm19
-; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm23 = [14,0,0,7,14,0,0,7]
-; AVX512BW-NEXT:    # zmm23 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm18
-; AVX512BW-NEXT:    vpermt2q %zmm28, %zmm23, %zmm18
-; AVX512BW-NEXT:    vmovdqa 16(%rdi), %xmm4
-; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm4 = xmm4[0,1],mem[2,3]
-; AVX512BW-NEXT:    vinserti32x4 $0, %xmm4, %zmm18, %zmm18
-; AVX512BW-NEXT:    vmovdqa64 %zmm19, %zmm18 {%k1}
-; AVX512BW-NEXT:    vpermi2q %zmm31, %zmm0, %zmm20
-; AVX512BW-NEXT:    vpermt2q %zmm30, %zmm21, %zmm20
-; AVX512BW-NEXT:    vpermi2q %zmm26, %zmm2, %zmm23
-; AVX512BW-NEXT:    vmovdqa 464(%rdi), %xmm4
-; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm4 = xmm4[0,1],mem[2,3]
-; AVX512BW-NEXT:    vinserti32x4 $0, %xmm4, %zmm23, %zmm19
-; AVX512BW-NEXT:    vmovdqa64 %zmm20, %zmm19 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm4
-; AVX512BW-NEXT:    vpermt2q %zmm15, %zmm22, %zmm4
-; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm21 = [0,5,6,12,0,5,6,12]
+; AVX512BW-NEXT:    vmovdqa64 %zmm29, %zmm16
+; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm20, %zmm16
+; AVX512BW-NEXT:    vmovdqa64 %zmm17, %zmm16 {%k2}
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm19 = [11,4,11,4,11,4,11,4]
+; AVX512BW-NEXT:    # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm17
+; AVX512BW-NEXT:    vpermt2q %zmm15, %zmm19, %zmm17
+; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm21 = [0,5,6,10,0,5,6,10]
 ; AVX512BW-NEXT:    # zmm21 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vpermt2q %zmm14, %zmm21, %zmm4
-; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm23 = [7,0,9,0,7,0,9,0]
+; AVX512BW-NEXT:    vpermt2q %zmm13, %zmm21, %zmm17
+; AVX512BW-NEXT:    vmovdqa64 %zmm17, %zmm16 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm22
+; AVX512BW-NEXT:    vpermt2q %zmm26, %zmm24, %zmm22
+; AVX512BW-NEXT:    vmovdqa64 %zmm28, %zmm17
+; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm20, %zmm17
+; AVX512BW-NEXT:    vmovdqa64 %zmm22, %zmm17 {%k2}
+; AVX512BW-NEXT:    vpermi2q %zmm6, %zmm0, %zmm19
+; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm21, %zmm19
+; AVX512BW-NEXT:    vmovdqa64 %zmm19, %zmm17 {%k1}
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm22 = [12,5,12,5,12,5,12,5]
+; AVX512BW-NEXT:    # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm21
+; AVX512BW-NEXT:    vpermt2q %zmm15, %zmm22, %zmm21
+; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm23 = [0,5,6,11,0,5,6,11]
 ; AVX512BW-NEXT:    # zmm23 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm28, %zmm20
-; AVX512BW-NEXT:    vpermt2q %zmm10, %zmm23, %zmm20
-; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %ymm25
-; AVX512BW-NEXT:    vpalignr {{.*#+}} ymm25 = mem[8,9,10,11,12,13,14,15],ymm25[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm25[16,17,18,19,20,21,22,23]
-; AVX512BW-NEXT:    vextracti32x4 $1, %ymm25, %xmm25
-; AVX512BW-NEXT:    vinserti32x4 $0, %xmm25, %zmm20, %zmm20
-; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm20 {%k1}
-; AVX512BW-NEXT:    vpermi2q %zmm31, %zmm0, %zmm22
-; AVX512BW-NEXT:    vpermt2q %zmm30, %zmm21, %zmm22
-; AVX512BW-NEXT:    vpermi2q %zmm2, %zmm26, %zmm23
-; AVX512BW-NEXT:    vmovdqa 512(%rdi), %ymm4
-; AVX512BW-NEXT:    vpalignr {{.*#+}} ymm4 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23]
-; AVX512BW-NEXT:    vextracti128 $1, %ymm4, %xmm4
-; AVX512BW-NEXT:    vinserti32x4 $0, %xmm4, %zmm23, %zmm21
+; AVX512BW-NEXT:    vpermt2q %zmm13, %zmm23, %zmm21
+; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm25 = [14,0,0,7,14,0,0,7]
+; AVX512BW-NEXT:    # zmm25 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm19
+; AVX512BW-NEXT:    vpermt2q %zmm30, %zmm25, %zmm19
+; AVX512BW-NEXT:    vmovdqa 16(%rdi), %xmm5
+; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3]
+; AVX512BW-NEXT:    vinserti32x4 $0, %xmm5, %zmm19, %zmm19
+; AVX512BW-NEXT:    vmovdqa64 %zmm21, %zmm19 {%k1}
+; AVX512BW-NEXT:    vpermi2q %zmm6, %zmm0, %zmm22
+; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm23, %zmm22
+; AVX512BW-NEXT:    vpermi2q %zmm26, %zmm4, %zmm25
+; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa 464(%rdi), %xmm5
+; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3]
+; AVX512BW-NEXT:    vinserti32x4 $0, %xmm5, %zmm25, %zmm21
 ; AVX512BW-NEXT:    vmovdqa64 %zmm22, %zmm21 {%k1}
-; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm4 = [0,7,14,0,0,7,14,0]
-; AVX512BW-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm22
-; AVX512BW-NEXT:    vpermt2q %zmm15, %zmm4, %zmm22
-; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm23 = [4,5,6,13,4,5,6,13]
+; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm5
+; AVX512BW-NEXT:    vpermt2q %zmm15, %zmm24, %zmm5
+; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm23 = [0,5,6,12,0,5,6,12]
 ; AVX512BW-NEXT:    # zmm23 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vpermt2q %zmm14, %zmm23, %zmm22
-; AVX512BW-NEXT:    vinserti128 $1, 192(%rdi), %ymm0, %ymm5
-; AVX512BW-NEXT:    vmovdqa 128(%rdi), %ymm12
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm12[0,1,2,3,4,5],ymm5[6,7]
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm13 = [4,11]
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm11
-; AVX512BW-NEXT:    vpermt2q %zmm27, %zmm13, %zmm11
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3],ymm5[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm5, %zmm22, %zmm22
-; AVX512BW-NEXT:    vpermi2q %zmm31, %zmm0, %zmm4
-; AVX512BW-NEXT:    vpermt2q %zmm30, %zmm23, %zmm4
-; AVX512BW-NEXT:    vinserti128 $1, 640(%rdi), %ymm0, %ymm5
-; AVX512BW-NEXT:    vmovdqa 576(%rdi), %ymm11
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3,4,5],ymm5[6,7]
-; AVX512BW-NEXT:    vpermi2q %zmm24, %zmm6, %zmm13
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3],ymm5[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm5, %zmm4, %zmm23
-; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm4 = [9,0,7,0,9,0,7,0]
-; AVX512BW-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm15, %zmm5
-; AVX512BW-NEXT:    vpermt2q %zmm9, %zmm4, %zmm5
-; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm13 = [4,5,6,14,4,5,6,14]
-; AVX512BW-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vpermt2q %zmm14, %zmm13, %zmm5
-; AVX512BW-NEXT:    vmovdqa64 192(%rdi), %ymm25
-; AVX512BW-NEXT:    vpalignr {{.*#+}} ymm12 = ymm12[8,9,10,11,12,13,14,15],ymm25[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm25[16,17,18,19,20,21,22,23]
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = [5,12]
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm8
-; AVX512BW-NEXT:    vpermt2q %zmm27, %zmm3, %zmm8
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm12[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm8, %zmm5, %zmm5
-; AVX512BW-NEXT:    vpermi2q %zmm0, %zmm31, %zmm4
-; AVX512BW-NEXT:    vpermt2q %zmm30, %zmm13, %zmm4
-; AVX512BW-NEXT:    vmovdqa 640(%rdi), %ymm8
-; AVX512BW-NEXT:    vpalignr {{.*#+}} ymm8 = ymm11[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23]
-; AVX512BW-NEXT:    vpermi2q %zmm24, %zmm6, %zmm3
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm3, %zmm4, %zmm3
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm4 = [2,9,2,9,2,9,2,9]
-; AVX512BW-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vpermt2q %zmm15, %zmm4, %zmm9
-; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [4,5,8,15,4,5,8,15]
-; AVX512BW-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vpermt2q %zmm14, %zmm8, %zmm9
+; AVX512BW-NEXT:    vpermt2q %zmm13, %zmm23, %zmm5
+; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm25 = [7,0,9,0,7,0,9,0]
+; AVX512BW-NEXT:    # zmm25 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vmovdqa64 %zmm30, %zmm22
+; AVX512BW-NEXT:    vpermt2q %zmm9, %zmm25, %zmm22
+; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %ymm27
+; AVX512BW-NEXT:    vpalignr {{.*#+}} ymm27 = mem[8,9,10,11,12,13,14,15],ymm27[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm27[16,17,18,19,20,21,22,23]
+; AVX512BW-NEXT:    vextracti32x4 $1, %ymm27, %xmm27
+; AVX512BW-NEXT:    vinserti32x4 $0, %xmm27, %zmm22, %zmm22
+; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm22 {%k1}
+; AVX512BW-NEXT:    vpermi2q %zmm6, %zmm0, %zmm24
+; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm23, %zmm24
+; AVX512BW-NEXT:    vpermi2q %zmm4, %zmm26, %zmm25
+; AVX512BW-NEXT:    vmovdqa 512(%rdi), %ymm5
+; AVX512BW-NEXT:    vpalignr {{.*#+}} ymm5 = mem[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23]
+; AVX512BW-NEXT:    vextracti128 $1, %ymm5, %xmm5
+; AVX512BW-NEXT:    vinserti32x4 $0, %xmm5, %zmm25, %zmm23
+; AVX512BW-NEXT:    vmovdqa64 %zmm24, %zmm23 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm5
+; AVX512BW-NEXT:    vpermt2q %zmm15, %zmm18, %zmm5
+; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm25 = [4,5,6,13,4,5,6,13]
+; AVX512BW-NEXT:    # zmm25 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vpermt2q %zmm13, %zmm25, %zmm5
+; AVX512BW-NEXT:    vinserti128 $1, 192(%rdi), %ymm0, %ymm4
+; AVX512BW-NEXT:    vmovdqa 128(%rdi), %ymm11
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm11[0,1,2,3,4,5],ymm4[6,7]
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm12 = [4,11]
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm10
+; AVX512BW-NEXT:    vpermt2q %zmm29, %zmm12, %zmm10
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm10[0,1,2,3],ymm4[4,5,6,7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm4, %zmm5, %zmm24
+; AVX512BW-NEXT:    vpermi2q %zmm6, %zmm0, %zmm18
+; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm25, %zmm18
+; AVX512BW-NEXT:    vinserti128 $1, 640(%rdi), %ymm0, %ymm4
+; AVX512BW-NEXT:    vmovdqa 576(%rdi), %ymm5
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7]
+; AVX512BW-NEXT:    vpermi2q %zmm28, %zmm2, %zmm12
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm12[0,1,2,3],ymm4[4,5,6,7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm4, %zmm18, %zmm25
+; AVX512BW-NEXT:    vmovdqa64 %zmm15, %zmm10
+; AVX512BW-NEXT:    vpermt2q %zmm8, %zmm20, %zmm10
+; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm12 = [4,5,6,14,4,5,6,14]
+; AVX512BW-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vpermt2q %zmm13, %zmm12, %zmm10
+; AVX512BW-NEXT:    vmovdqa64 192(%rdi), %ymm18
+; AVX512BW-NEXT:    vpalignr {{.*#+}} ymm11 = ymm11[8,9,10,11,12,13,14,15],ymm18[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm18[16,17,18,19,20,21,22,23]
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm4 = [5,12]
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm7
+; AVX512BW-NEXT:    vpermt2q %zmm29, %zmm4, %zmm7
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm11[4,5,6,7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm7, %zmm10, %zmm7
+; AVX512BW-NEXT:    vpermi2q %zmm0, %zmm6, %zmm20
+; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm12, %zmm20
+; AVX512BW-NEXT:    vmovdqa 640(%rdi), %ymm10
+; AVX512BW-NEXT:    vpalignr {{.*#+}} ymm5 = ymm5[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23]
+; AVX512BW-NEXT:    vpermi2q %zmm28, %zmm2, %zmm4
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm4, %zmm20, %zmm4
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm5 = [2,9,2,9,2,9,2,9]
+; AVX512BW-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vpermt2q %zmm15, %zmm5, %zmm8
+; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [4,5,8,15,4,5,8,15]
+; AVX512BW-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vpermt2q %zmm13, %zmm10, %zmm8
 ; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm11 = [4,11,4,11]
 ; AVX512BW-NEXT:    # ymm11 = mem[0,1,0,1]
-; AVX512BW-NEXT:    vpermt2q %zmm28, %zmm11, %zmm10
+; AVX512BW-NEXT:    vpermt2q %zmm30, %zmm11, %zmm9
 ; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm12 = [6,13]
-; AVX512BW-NEXT:    vpermt2q %zmm27, %zmm12, %zmm1
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm10[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm9, %zmm1
-; AVX512BW-NEXT:    vpermt2q %zmm31, %zmm4, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm30, %zmm8, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm26, %zmm11, %zmm2
-; AVX512BW-NEXT:    vpermt2q %zmm24, %zmm12, %zmm6
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7]
+; AVX512BW-NEXT:    vpermt2q %zmm29, %zmm12, %zmm1
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm9[4,5,6,7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm8, %zmm1
+; AVX512BW-NEXT:    vpermt2q %zmm6, %zmm5, %zmm0
+; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm10, %zmm0
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm26, %zmm11, %zmm3
+; AVX512BW-NEXT:    vpermt2q %zmm28, %zmm12, %zmm2
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
 ; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm0, %zmm0
-; AVX512BW-NEXT:    vmovdqa64 %zmm7, 64(%rsi)
-; AVX512BW-NEXT:    vmovdqa64 %zmm29, (%rsi)
-; AVX512BW-NEXT:    vmovdqa64 %zmm16, 64(%rdx)
-; AVX512BW-NEXT:    vmovdqa64 %zmm17, (%rdx)
-; AVX512BW-NEXT:    vmovdqa64 %zmm19, 64(%rcx)
-; AVX512BW-NEXT:    vmovdqa64 %zmm18, (%rcx)
-; AVX512BW-NEXT:    vmovdqa64 %zmm21, 64(%r8)
-; AVX512BW-NEXT:    vmovdqa64 %zmm20, (%r8)
-; AVX512BW-NEXT:    vmovdqa64 %zmm23, 64(%r9)
-; AVX512BW-NEXT:    vmovdqa64 %zmm22, (%r9)
-; AVX512BW-NEXT:    vmovdqa64 %zmm3, 64(%r10)
-; AVX512BW-NEXT:    vmovdqa64 %zmm5, (%r10)
+; AVX512BW-NEXT:    vmovdqa64 %zmm14, 64(%rsi)
+; AVX512BW-NEXT:    vmovdqa64 %zmm31, (%rsi)
+; AVX512BW-NEXT:    vmovdqa64 %zmm17, 64(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm16, (%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm21, 64(%rcx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm19, (%rcx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm23, 64(%r8)
+; AVX512BW-NEXT:    vmovdqa64 %zmm22, (%r8)
+; AVX512BW-NEXT:    vmovdqa64 %zmm25, 64(%r9)
+; AVX512BW-NEXT:    vmovdqa64 %zmm24, (%r9)
+; AVX512BW-NEXT:    vmovdqa64 %zmm4, 64(%r10)
+; AVX512BW-NEXT:    vmovdqa64 %zmm7, (%r10)
 ; AVX512BW-NEXT:    vmovdqa64 %zmm0, 64(%rax)
 ; AVX512BW-NEXT:    vmovdqa64 %zmm1, (%rax)
 ; AVX512BW-NEXT:    vzeroupper
@@ -3680,892 +3682,960 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ;
 ; AVX512F-LABEL: load_i64_stride7_vf32:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    subq $2216, %rsp # imm = 0x8A8
-; AVX512F-NEXT:    vmovdqa64 1664(%rdi), %zmm21
-; AVX512F-NEXT:    vmovdqa64 1600(%rdi), %zmm3
-; AVX512F-NEXT:    vmovdqa64 1216(%rdi), %zmm1
-; AVX512F-NEXT:    vmovdqa64 1152(%rdi), %zmm6
-; AVX512F-NEXT:    vmovdqa64 1088(%rdi), %zmm11
-; AVX512F-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovaps 1024(%rdi), %zmm0
-; AVX512F-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    subq $2728, %rsp # imm = 0xAA8
+; AVX512F-NEXT:    vmovdqa64 1216(%rdi), %zmm25
+; AVX512F-NEXT:    vmovdqa64 1152(%rdi), %zmm1
+; AVX512F-NEXT:    vmovdqa64 960(%rdi), %zmm31
+; AVX512F-NEXT:    vmovdqa64 896(%rdi), %zmm12
 ; AVX512F-NEXT:    vmovdqa64 768(%rdi), %zmm2
-; AVX512F-NEXT:    vmovdqa64 704(%rdi), %zmm9
-; AVX512F-NEXT:    vmovdqa64 640(%rdi), %zmm15
-; AVX512F-NEXT:    vmovdqa64 576(%rdi), %zmm10
-; AVX512F-NEXT:    vmovdqa64 320(%rdi), %zmm0
-; AVX512F-NEXT:    vmovdqa64 256(%rdi), %zmm26
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm7 = [10,3,10,3,10,3,10,3]
-; AVX512F-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm26, %zmm8
-; AVX512F-NEXT:    vpermt2q %zmm0, %zmm7, %zmm8
-; AVX512F-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm8
-; AVX512F-NEXT:    vpermt2q %zmm2, %zmm7, %zmm8
-; AVX512F-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm8
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm7, %zmm8
-; AVX512F-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermi2q %zmm21, %zmm3, %zmm7
+; AVX512F-NEXT:    vmovdqa64 704(%rdi), %zmm20
+; AVX512F-NEXT:    vmovdqa64 512(%rdi), %zmm9
+; AVX512F-NEXT:    vmovdqa64 448(%rdi), %zmm7
 ; AVX512F-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm7 = [11,4,11,4,11,4,11,4]
-; AVX512F-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm8
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm7, %zmm8
-; AVX512F-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm7, %zmm13
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm7 = [13,6,13,6,13,6,13,6]
-; AVX512F-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm8
-; AVX512F-NEXT:    vpermt2q %zmm2, %zmm13, %zmm8
-; AVX512F-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm26, %zmm8
-; AVX512F-NEXT:    vpermt2q %zmm0, %zmm13, %zmm8
-; AVX512F-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm20 = [12,5,12,5,12,5,12,5]
-; AVX512F-NEXT:    # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm18
-; AVX512F-NEXT:    vpermt2q %zmm2, %zmm20, %zmm18
-; AVX512F-NEXT:    vmovdqa64 %zmm26, %zmm23
-; AVX512F-NEXT:    vpermt2q %zmm0, %zmm20, %zmm23
-; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm17
-; AVX512F-NEXT:    vpermt2q %zmm2, %zmm7, %zmm17
-; AVX512F-NEXT:    vmovdqa64 %zmm26, %zmm25
-; AVX512F-NEXT:    vpermt2q %zmm0, %zmm7, %zmm25
-; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm28 = [0,7,14,0,0,7,14,0]
-; AVX512F-NEXT:    # zmm28 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm8
-; AVX512F-NEXT:    vpermt2q %zmm2, %zmm28, %zmm8
-; AVX512F-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm26, %zmm16
-; AVX512F-NEXT:    vpermt2q %zmm0, %zmm28, %zmm16
-; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm30 = [9,0,7,0,9,0,7,0]
+; AVX512F-NEXT:    vmovdqa64 320(%rdi), %zmm26
+; AVX512F-NEXT:    vmovdqa64 256(%rdi), %zmm19
+; AVX512F-NEXT:    vmovdqa64 64(%rdi), %zmm23
+; AVX512F-NEXT:    vmovdqa64 (%rdi), %zmm6
+; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm30 = [0,7,14,0,0,7,14,0]
 ; AVX512F-NEXT:    # zmm30 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm14
-; AVX512F-NEXT:    vpermt2q %zmm9, %zmm30, %zmm14
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm4 = [2,9,2,9,2,9,2,9]
-; AVX512F-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vpermt2q %zmm2, %zmm4, %zmm9
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm8
-; AVX512F-NEXT:    vpermt2q %zmm26, %zmm30, %zmm8
-; AVX512F-NEXT:    vpermt2q %zmm0, %zmm4, %zmm26
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm7, %zmm2
-; AVX512F-NEXT:    vpermt2q %zmm11, %zmm7, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm10, %zmm11
-; AVX512F-NEXT:    vmovdqa64 %zmm10, %zmm0
-; AVX512F-NEXT:    vmovdqa64 %zmm15, %zmm10
-; AVX512F-NEXT:    vpermt2q %zmm15, %zmm7, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 192(%rdi), %zmm24
-; AVX512F-NEXT:    vmovdqa64 128(%rdi), %zmm12
-; AVX512F-NEXT:    vmovdqa64 %zmm12, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm24, %zmm7, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm5
-; AVX512F-NEXT:    vpermi2q %zmm21, %zmm3, %zmm13
-; AVX512F-NEXT:    vmovdqa64 1536(%rdi), %zmm15
-; AVX512F-NEXT:    vmovdqa64 1472(%rdi), %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm15, %zmm7, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm29
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm20, %zmm29
-; AVX512F-NEXT:    vpermi2q %zmm21, %zmm3, %zmm20
-; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm31
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm7, %zmm31
-; AVX512F-NEXT:    vpermi2q %zmm21, %zmm3, %zmm2
-; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm22
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm28, %zmm22
-; AVX512F-NEXT:    vpermi2q %zmm21, %zmm3, %zmm28
-; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm3
-; AVX512F-NEXT:    vpermt2q %zmm6, %zmm30, %zmm3
-; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm0
-; AVX512F-NEXT:    vpermi2q %zmm5, %zmm21, %zmm30
-; AVX512F-NEXT:    vpermt2q %zmm21, %zmm4, %zmm5
-; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm21
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm4, %zmm0
-; AVX512F-NEXT:    vmovdqa64 384(%rdi), %zmm1
-; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,9,0,5,6,9]
-; AVX512F-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm4, %zmm5
-; AVX512F-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 832(%rdi), %zmm5
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm5, %zmm4, %zmm6
+; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm3
+; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm13
 ; AVX512F-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 1280(%rdi), %zmm6
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm6, %zmm4, %zmm7
-; AVX512F-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 1728(%rdi), %zmm7
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm7, %zmm4, %zmm27
-; AVX512F-NEXT:    vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,10,0,5,6,10]
-; AVX512F-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm6, %zmm4, %zmm19
-; AVX512F-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm5, %zmm4, %zmm19
-; AVX512F-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm4, %zmm19
-; AVX512F-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm7, %zmm4, %zmm13
-; AVX512F-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,11,0,5,6,11]
-; AVX512F-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vpermt2q %zmm6, %zmm4, %zmm29
-; AVX512F-NEXT:    vpermt2q %zmm5, %zmm4, %zmm18
-; AVX512F-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm4, %zmm23
+; AVX512F-NEXT:    vpermt2q %zmm23, %zmm30, %zmm3
+; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-NEXT:    vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm7, %zmm4, %zmm20
-; AVX512F-NEXT:    vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,12,0,5,6,12]
-; AVX512F-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vpermt2q %zmm6, %zmm4, %zmm31
-; AVX512F-NEXT:    vpermt2q %zmm5, %zmm4, %zmm17
-; AVX512F-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm4, %zmm25
-; AVX512F-NEXT:    vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm7, %zmm4, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,13,4,5,6,13]
-; AVX512F-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vpermt2q %zmm6, %zmm4, %zmm22
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm5, %zmm4, %zmm19
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm4, %zmm16
-; AVX512F-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm7, %zmm4, %zmm28
-; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,14,4,5,6,14]
-; AVX512F-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vpermt2q %zmm6, %zmm4, %zmm3
+; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [10,3,10,3,10,3,10,3]
+; AVX512F-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vmovdqa64 %zmm19, %zmm6
+; AVX512F-NEXT:    vpermt2q %zmm26, %zmm3, %zmm6
+; AVX512F-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm8
+; AVX512F-NEXT:    vmovdqa64 %zmm7, %zmm3
+; AVX512F-NEXT:    vpermt2q %zmm9, %zmm30, %zmm3
 ; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm5, %zmm4, %zmm14
-; AVX512F-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm4, %zmm8
-; AVX512F-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm7, %zmm4, %zmm30
-; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,8,15,4,5,8,15]
-; AVX512F-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vpermt2q %zmm5, %zmm4, %zmm9
+; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm14
 ; AVX512F-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm4, %zmm26
-; AVX512F-NEXT:    vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm7, %zmm4, %zmm21
-; AVX512F-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm6, %zmm4, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    movb $24, %al
-; AVX512F-NEXT:    kmovw %eax, %k1
-; AVX512F-NEXT:    vmovdqa64 512(%rdi), %zmm5
-; AVX512F-NEXT:    vmovdqa64 448(%rdi), %zmm7
-; AVX512F-NEXT:    vmovdqa64 {{.*#+}} ymm21 = <0,7,14,u>
-; AVX512F-NEXT:    vmovdqa64 %zmm7, %zmm18
-; AVX512F-NEXT:    vpermt2q %zmm5, %zmm21, %zmm18
-; AVX512F-NEXT:    vmovdqa64 %zmm10, %zmm0
-; AVX512F-NEXT:    vshufi64x2 {{.*#+}} zmm18 {%k1} = zmm11[4,5,4,5],zmm10[4,5,4,5]
-; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [14,0,0,7,14,0,0,7]
-; AVX512F-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm11, %zmm26
-; AVX512F-NEXT:    vpermt2q %zmm0, %zmm10, %zmm26
-; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm16 = [7,0,9,0,7,0,9,0]
-; AVX512F-NEXT:    # zmm16 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm27
-; AVX512F-NEXT:    vpermt2q %zmm11, %zmm16, %zmm27
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} ymm20 = [4,11,4,11]
-; AVX512F-NEXT:    # ymm20 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vpermt2q %zmm0, %zmm20, %zmm11
-; AVX512F-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 64(%rdi), %zmm1
-; AVX512F-NEXT:    vmovdqa64 (%rdi), %zmm14
-; AVX512F-NEXT:    vmovdqa64 %zmm14, %zmm17
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm21, %zmm17
-; AVX512F-NEXT:    vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm12[4,5,4,5],zmm24[4,5,4,5]
-; AVX512F-NEXT:    vmovdqa64 %zmm12, %zmm23
-; AVX512F-NEXT:    vpermt2q %zmm24, %zmm10, %zmm23
-; AVX512F-NEXT:    vmovdqa64 %zmm24, %zmm25
-; AVX512F-NEXT:    vpermt2q %zmm12, %zmm16, %zmm25
-; AVX512F-NEXT:    vpermt2q %zmm24, %zmm20, %zmm12
-; AVX512F-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 960(%rdi), %zmm6
-; AVX512F-NEXT:    vmovdqa64 896(%rdi), %zmm11
-; AVX512F-NEXT:    vmovdqa64 %zmm11, %zmm12
-; AVX512F-NEXT:    vpermt2q %zmm6, %zmm21, %zmm12
-; AVX512F-NEXT:    vmovdqa64 1408(%rdi), %zmm4
-; AVX512F-NEXT:    vmovdqa64 1344(%rdi), %zmm8
-; AVX512F-NEXT:    vpermi2q %zmm4, %zmm8, %zmm21
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT:    vshufi64x2 {{.*#+}} zmm21 {%k1} = zmm3[4,5,4,5],zmm15[4,5,4,5]
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm0
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm9, %zmm10, %zmm0
-; AVX512F-NEXT:    vpermi2q %zmm15, %zmm3, %zmm10
-; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm24
-; AVX512F-NEXT:    vpermt2q %zmm2, %zmm16, %zmm24
-; AVX512F-NEXT:    vpermi2q %zmm3, %zmm15, %zmm16
-; AVX512F-NEXT:    vpermt2q %zmm15, %zmm20, %zmm3
+; AVX512F-NEXT:    vmovdqa64 %zmm20, %zmm3
+; AVX512F-NEXT:    vpermt2q %zmm2, %zmm8, %zmm3
 ; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vshufi64x2 {{.*#+}} zmm12 {%k1} = zmm2[4,5,4,5],zmm9[4,5,4,5]
-; AVX512F-NEXT:    vpermt2q %zmm9, %zmm20, %zmm2
+; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm11 = [13,6,13,6,13,6,13,6]
+; AVX512F-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm9 = [9,0,7,0,9,0,7,0]
+; AVX512F-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm15 = [11,4,11,4,11,4,11,4]
+; AVX512F-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vmovdqa64 %zmm20, %zmm3
+; AVX512F-NEXT:    vpermt2q %zmm2, %zmm15, %zmm3
+; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm19, %zmm3
+; AVX512F-NEXT:    vpermt2q %zmm26, %zmm15, %zmm3
+; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm22 = [12,5,12,5,12,5,12,5]
+; AVX512F-NEXT:    # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vmovdqa64 %zmm20, %zmm3
+; AVX512F-NEXT:    vpermt2q %zmm2, %zmm22, %zmm3
+; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm19, %zmm3
+; AVX512F-NEXT:    vpermt2q %zmm26, %zmm22, %zmm3
+; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm20, %zmm3
+; AVX512F-NEXT:    vpermt2q %zmm2, %zmm11, %zmm3
+; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm19, %zmm3
+; AVX512F-NEXT:    vpermt2q %zmm26, %zmm11, %zmm3
+; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm20, %zmm3
+; AVX512F-NEXT:    vpermt2q %zmm2, %zmm30, %zmm3
+; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm19, %zmm3
+; AVX512F-NEXT:    vpermt2q %zmm26, %zmm30, %zmm3
+; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm3
+; AVX512F-NEXT:    vpermt2q %zmm20, %zmm9, %zmm3
+; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [2,9,2,9,2,9,2,9]
+; AVX512F-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vpermt2q %zmm2, %zmm0, %zmm20
+; AVX512F-NEXT:    vmovdqa64 %zmm26, %zmm21
+; AVX512F-NEXT:    vpermt2q %zmm19, %zmm9, %zmm21
+; AVX512F-NEXT:    vpermt2q %zmm26, %zmm0, %zmm19
+; AVX512F-NEXT:    vmovdqa64 %zmm12, %zmm3
+; AVX512F-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm12, %zmm2
+; AVX512F-NEXT:    vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm31, %zmm30, %zmm2
 ; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa 912(%rdi), %xmm2
-; AVX512F-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
-; AVX512F-NEXT:    vinserti32x4 $0, %xmm2, %zmm0, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa 464(%rdi), %xmm0
-; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3]
-; AVX512F-NEXT:    vinserti32x4 $0, %xmm0, %zmm26, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa 16(%rdi), %xmm0
-; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3]
-; AVX512F-NEXT:    vinserti32x4 $0, %xmm0, %zmm23, %zmm26
-; AVX512F-NEXT:    vmovdqa 1360(%rdi), %xmm0
-; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3]
-; AVX512F-NEXT:    vinserti32x4 $0, %xmm0, %zmm10, %zmm20
-; AVX512F-NEXT:    vinserti128 $1, 1088(%rdi), %ymm0, %ymm0
-; AVX512F-NEXT:    vmovdqa 1024(%rdi), %ymm2
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
-; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm10 = [4,11]
-; AVX512F-NEXT:    vmovdqa64 %zmm11, %zmm13
-; AVX512F-NEXT:    vpermt2q %zmm6, %zmm10, %zmm13
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm22, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vinserti128 $1, 640(%rdi), %ymm0, %ymm0
-; AVX512F-NEXT:    vmovdqa 576(%rdi), %ymm3
-; AVX512F-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7]
-; AVX512F-NEXT:    vmovdqa64 %zmm7, %zmm3
-; AVX512F-NEXT:    vpermt2q %zmm5, %zmm10, %zmm3
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm19, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vinserti128 $1, 192(%rdi), %ymm0, %ymm0
-; AVX512F-NEXT:    vmovdqa 128(%rdi), %ymm3
-; AVX512F-NEXT:    vmovdqu %ymm3, (%rsp) # 32-byte Spill
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7]
-; AVX512F-NEXT:    vmovdqa64 %zmm14, %zmm3
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm10, %zmm3
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm3, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vinserti128 $1, 1536(%rdi), %ymm0, %ymm0
-; AVX512F-NEXT:    vmovdqa 1472(%rdi), %ymm3
-; AVX512F-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7]
-; AVX512F-NEXT:    vpermi2q %zmm4, %zmm8, %zmm10
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm28, %zmm28
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm12
+; AVX512F-NEXT:    vpermt2q %zmm25, %zmm8, %zmm12
+; AVX512F-NEXT:    vmovdqa64 1408(%rdi), %zmm28
+; AVX512F-NEXT:    vmovdqa64 1344(%rdi), %zmm2
+; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm5
+; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm7
+; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm28, %zmm30, %zmm5
+; AVX512F-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 1664(%rdi), %zmm2
+; AVX512F-NEXT:    vmovdqa64 1600(%rdi), %zmm10
+; AVX512F-NEXT:    vpermi2q %zmm2, %zmm10, %zmm8
+; AVX512F-NEXT:    vmovdqa64 1088(%rdi), %zmm18
+; AVX512F-NEXT:    vmovdqa64 1024(%rdi), %zmm6
+; AVX512F-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm11, %zmm5
+; AVX512F-NEXT:    vpermt2q %zmm18, %zmm11, %zmm6
+; AVX512F-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm3, %zmm9, %zmm31
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm4
+; AVX512F-NEXT:    vmovdqa64 %zmm15, %zmm6
+; AVX512F-NEXT:    vpermt2q %zmm25, %zmm15, %zmm4
+; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm17
+; AVX512F-NEXT:    vmovdqa64 640(%rdi), %zmm15
+; AVX512F-NEXT:    vmovdqa64 576(%rdi), %zmm4
+; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm15, %zmm11, %zmm4
+; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm14, %zmm26
+; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm26 # 64-byte Folded Reload
+; AVX512F-NEXT:    vmovdqa64 192(%rdi), %zmm16
+; AVX512F-NEXT:    vmovdqa64 128(%rdi), %zmm4
+; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm16, %zmm11, %zmm4
+; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm13, %zmm9, %zmm23
+; AVX512F-NEXT:    vmovdqa64 1536(%rdi), %zmm11
+; AVX512F-NEXT:    vmovdqa64 1472(%rdi), %zmm4
+; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm11, %zmm5, %zmm4
+; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm28, %zmm27
+; AVX512F-NEXT:    vpermt2q %zmm7, %zmm9, %zmm27
+; AVX512F-NEXT:    vpermi2q %zmm2, %zmm10, %zmm6
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm29
+; AVX512F-NEXT:    vmovdqa64 %zmm25, %zmm3
+; AVX512F-NEXT:    vpermt2q %zmm25, %zmm22, %zmm29
+; AVX512F-NEXT:    vpermi2q %zmm2, %zmm10, %zmm22
+; AVX512F-NEXT:    vmovdqa64 %zmm22, %zmm14
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm22
+; AVX512F-NEXT:    vpermt2q %zmm25, %zmm5, %zmm22
+; AVX512F-NEXT:    vpermi2q %zmm2, %zmm10, %zmm5
+; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm13
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm25
+; AVX512F-NEXT:    vpermt2q %zmm3, %zmm30, %zmm25
+; AVX512F-NEXT:    vpermi2q %zmm2, %zmm10, %zmm30
+; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm24
+; AVX512F-NEXT:    vpermt2q %zmm1, %zmm9, %zmm24
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm4
+; AVX512F-NEXT:    vpermi2q %zmm10, %zmm2, %zmm9
+; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm7
+; AVX512F-NEXT:    vpermt2q %zmm2, %zmm0, %zmm10
+; AVX512F-NEXT:    vpermt2q %zmm3, %zmm0, %zmm4
+; AVX512F-NEXT:    vmovdqa64 384(%rdi), %zmm0
+; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [0,5,6,9,0,5,6,9]
+; AVX512F-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm0, %zmm2, %zmm1
+; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 832(%rdi), %zmm3
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm3, %zmm2, %zmm1
+; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 1280(%rdi), %zmm5
+; AVX512F-NEXT:    vpermt2q %zmm5, %zmm2, %zmm12
+; AVX512F-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 1728(%rdi), %zmm9
+; AVX512F-NEXT:    vpermt2q %zmm9, %zmm2, %zmm8
+; AVX512F-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [0,5,6,10,0,5,6,10]
+; AVX512F-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vpermt2q %zmm5, %zmm2, %zmm17
+; AVX512F-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm3, %zmm2, %zmm1
+; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm0, %zmm2, %zmm1
+; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm9, %zmm2, %zmm6
+; AVX512F-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [0,5,6,11,0,5,6,11]
+; AVX512F-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vpermt2q %zmm5, %zmm2, %zmm29
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm3, %zmm2, %zmm1
+; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm0, %zmm2, %zmm1
+; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm9, %zmm2, %zmm14
+; AVX512F-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [0,5,6,12,0,5,6,12]
+; AVX512F-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vpermt2q %zmm5, %zmm2, %zmm22
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm3, %zmm2, %zmm1
+; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm0, %zmm2, %zmm1
+; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm9, %zmm2, %zmm13
+; AVX512F-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [4,5,6,13,4,5,6,13]
+; AVX512F-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vpermt2q %zmm5, %zmm2, %zmm25
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm3, %zmm2, %zmm8
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm0, %zmm2, %zmm14
+; AVX512F-NEXT:    vpermt2q %zmm9, %zmm2, %zmm30
+; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [4,5,6,14,4,5,6,14]
+; AVX512F-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vpermt2q %zmm5, %zmm2, %zmm24
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm3, %zmm2, %zmm1
+; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm0, %zmm2, %zmm21
+; AVX512F-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm9, %zmm2, %zmm7
+; AVX512F-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [4,5,8,15,4,5,8,15]
+; AVX512F-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vpermt2q %zmm3, %zmm2, %zmm20
+; AVX512F-NEXT:    vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm0, %zmm2, %zmm19
+; AVX512F-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm9, %zmm2, %zmm10
+; AVX512F-NEXT:    vpermt2q %zmm5, %zmm2, %zmm4
+; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    movb $24, %al
+; AVX512F-NEXT:    kmovw %eax, %k1
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} = zmm1[4,5,4,5],zmm15[4,5,4,5]
+; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7]
+; AVX512F-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm2
+; AVX512F-NEXT:    vpermt2q %zmm15, %zmm0, %zmm2
+; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm4 = [7,0,9,0,7,0,9,0]
+; AVX512F-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vmovdqa64 %zmm15, %zmm20
+; AVX512F-NEXT:    vpermt2q %zmm1, %zmm4, %zmm20
+; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} ymm21 = [4,11,4,11]
+; AVX512F-NEXT:    # ymm21 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vpermt2q %zmm15, %zmm21, %zmm1
+; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm3[4,5,4,5],zmm16[4,5,4,5]
+; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm15
+; AVX512F-NEXT:    vpermt2q %zmm16, %zmm0, %zmm15
+; AVX512F-NEXT:    vmovdqa64 %zmm16, %zmm19
+; AVX512F-NEXT:    vpermt2q %zmm3, %zmm4, %zmm19
+; AVX512F-NEXT:    vpermt2q %zmm16, %zmm21, %zmm3
+; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm3[4,5,4,5],zmm11[4,5,4,5]
+; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm16
+; AVX512F-NEXT:    vpermt2q %zmm18, %zmm0, %zmm16
+; AVX512F-NEXT:    vpermi2q %zmm11, %zmm3, %zmm0
+; AVX512F-NEXT:    vmovdqa64 %zmm18, %zmm17
+; AVX512F-NEXT:    vpermt2q %zmm1, %zmm4, %zmm17
+; AVX512F-NEXT:    vpermi2q %zmm3, %zmm11, %zmm4
+; AVX512F-NEXT:    vpermt2q %zmm11, %zmm21, %zmm3
+; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512F-NEXT:    vshufi64x2 {{.*#+}} zmm9 {%k1} = zmm1[4,5,4,5],zmm18[4,5,4,5]
+; AVX512F-NEXT:    vpermt2q %zmm18, %zmm21, %zmm1
+; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa 912(%rdi), %xmm11
+; AVX512F-NEXT:    vpblendd {{.*#+}} xmm11 = xmm11[0,1],mem[2,3]
+; AVX512F-NEXT:    vinserti32x4 $0, %xmm11, %zmm16, %zmm1
+; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa 464(%rdi), %xmm11
+; AVX512F-NEXT:    vpblendd {{.*#+}} xmm11 = xmm11[0,1],mem[2,3]
+; AVX512F-NEXT:    vinserti32x4 $0, %xmm11, %zmm2, %zmm1
+; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa 16(%rdi), %xmm2
+; AVX512F-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
+; AVX512F-NEXT:    vinserti32x4 $0, %xmm2, %zmm15, %zmm21
+; AVX512F-NEXT:    vmovdqa 1360(%rdi), %xmm2
+; AVX512F-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
+; AVX512F-NEXT:    vinserti32x4 $0, %xmm2, %zmm0, %zmm16
+; AVX512F-NEXT:    vinserti128 $1, 1088(%rdi), %ymm0, %ymm0
+; AVX512F-NEXT:    vmovdqa 1024(%rdi), %ymm7
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm11 = ymm7[0,1,2,3,4,5],ymm0[6,7]
+; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm0 = [4,11]
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm12
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm13, %zmm0, %zmm12
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm11, %zmm25, %zmm1
+; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vinserti128 $1, 640(%rdi), %ymm0, %ymm11
+; AVX512F-NEXT:    vmovdqa 576(%rdi), %ymm1
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm11 = ymm1[0,1,2,3,4,5],ymm11[6,7]
+; AVX512F-NEXT:    vmovdqa64 %ymm1, %ymm25
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm15
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm12, %zmm0, %zmm15
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm11 = ymm15[0,1,2,3],ymm11[4,5,6,7]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm11, %zmm8, %zmm1
+; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vinserti128 $1, 192(%rdi), %ymm0, %ymm11
+; AVX512F-NEXT:    vmovdqa 128(%rdi), %ymm1
+; AVX512F-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm11 = ymm1[0,1,2,3,4,5],ymm11[6,7]
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm1
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm15, %zmm0, %zmm1
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm11[4,5,6,7]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm14, %zmm1
+; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vinserti128 $1, 1536(%rdi), %ymm0, %ymm1
+; AVX512F-NEXT:    vmovdqa 1472(%rdi), %ymm11
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5],ymm1[6,7]
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512F-NEXT:    vpermi2q %zmm28, %zmm8, %zmm0
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm30, %zmm0
+; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-NEXT:    vmovdqa 1088(%rdi), %ymm0
-; AVX512F-NEXT:    vpalignr {{.*#+}} ymm0 = ymm2[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
-; AVX512F-NEXT:    vmovdqa64 %ymm0, %ymm23
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm0 = <9,0,7,u>
-; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm3
-; AVX512F-NEXT:    vmovdqa64 %zmm7, %zmm9
-; AVX512F-NEXT:    vpermt2q %zmm7, %zmm0, %zmm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm2 = [5,12]
-; AVX512F-NEXT:    vmovdqa64 %zmm7, %zmm13
-; AVX512F-NEXT:    vpermt2q %zmm5, %zmm2, %zmm13
-; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm7 = [6,13]
-; AVX512F-NEXT:    vpermt2q %zmm5, %zmm7, %zmm9
-; AVX512F-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm5
-; AVX512F-NEXT:    vpermt2q %zmm14, %zmm0, %zmm5
-; AVX512F-NEXT:    vmovdqa64 %zmm14, %zmm10
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm2, %zmm10
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm7, %zmm14
-; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm1
-; AVX512F-NEXT:    vpermt2q %zmm11, %zmm0, %zmm1
-; AVX512F-NEXT:    vpermi2q %zmm8, %zmm4, %zmm0
+; AVX512F-NEXT:    vpalignr {{.*#+}} ymm14 = ymm7[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
+; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm0 = [5,12]
+; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm1
+; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm7
+; AVX512F-NEXT:    vpermt2q %zmm12, %zmm0, %zmm7
+; AVX512F-NEXT:    vmovdqa64 {{.*#+}} xmm30 = [6,13]
+; AVX512F-NEXT:    vpermt2q %zmm12, %zmm30, %zmm1
+; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm6
+; AVX512F-NEXT:    vpermt2q %zmm15, %zmm0, %zmm6
+; AVX512F-NEXT:    vpermt2q %zmm15, %zmm30, %zmm3
+; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-NEXT:    vmovdqa 960(%rdi), %ymm15
 ; AVX512F-NEXT:    vpalignr {{.*#+}} ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23]
-; AVX512F-NEXT:    vextracti32x4 $1, %ymm15, %xmm19
-; AVX512F-NEXT:    vmovdqa 512(%rdi), %ymm15
-; AVX512F-NEXT:    vpalignr {{.*#+}} ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23]
 ; AVX512F-NEXT:    vextracti128 $1, %ymm15, %xmm15
-; AVX512F-NEXT:    vmovdqa64 %zmm11, %zmm9
-; AVX512F-NEXT:    vpermt2q %zmm6, %zmm2, %zmm9
-; AVX512F-NEXT:    vpermi2q %zmm4, %zmm8, %zmm2
-; AVX512F-NEXT:    vpermt2q %zmm4, %zmm7, %zmm8
-; AVX512F-NEXT:    vpermt2q %zmm6, %zmm7, %zmm11
+; AVX512F-NEXT:    vmovdqa 512(%rdi), %ymm12
+; AVX512F-NEXT:    vpalignr {{.*#+}} ymm12 = mem[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23]
+; AVX512F-NEXT:    vextracti128 $1, %ymm12, %xmm12
+; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm2
+; AVX512F-NEXT:    vpermt2q %zmm13, %zmm0, %zmm2
+; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm3
+; AVX512F-NEXT:    vpermi2q %zmm28, %zmm8, %zmm0
+; AVX512F-NEXT:    vpermt2q %zmm28, %zmm30, %zmm3
+; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm13, %zmm30, %zmm5
+; AVX512F-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-NEXT:    movb $-32, %al
 ; AVX512F-NEXT:    kmovw %eax, %k2
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm17 {%k2}
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm18 {%k2}
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm12 {%k2}
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm21 {%k2}
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm1 {%k1}
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm1 {%k2}
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm3 {%k1}
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm3 {%k2}
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm5 {%k1}
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm5 {%k2}
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm0 {%k1}
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm0 {%k2}
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm29, %zmm22 {%k2}
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm28 {%k2}
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm30 {%k2}
+; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm13
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm13 {%k2}
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm18 {%k2}
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm31 {%k1}
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm31 {%k2}
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm26 {%k1}
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm26 {%k2}
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm23 {%k1}
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm23 {%k2}
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm27 {%k1}
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm27 {%k2}
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm29, %zmm5 {%k2}
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm29 {%k2}
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm26 {%k2}
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm20 {%k2}
-; AVX512F-NEXT:    vinserti32x4 $0, %xmm19, %zmm24, %zmm4
-; AVX512F-NEXT:    vmovdqa64 %zmm31, %zmm4 {%k2}
-; AVX512F-NEXT:    vinserti32x4 $0, %xmm15, %zmm27, %zmm6
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm7, %zmm6 {%k2}
-; AVX512F-NEXT:    vmovdqa64 %ymm23, %ymm7
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7]
-; AVX512F-NEXT:    vmovdqa 64(%rdi), %ymm9
-; AVX512F-NEXT:    vpalignr {{.*#+}} ymm9 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23]
-; AVX512F-NEXT:    vextracti128 $1, %ymm9, %xmm9
-; AVX512F-NEXT:    vinserti32x4 $0, %xmm9, %zmm25, %zmm9
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm15, %zmm9 {%k2}
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm7, %zmm15, %zmm19
-; AVX512F-NEXT:    vmovdqa 640(%rdi), %ymm15
-; AVX512F-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload
-; AVX512F-NEXT:    # ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23]
-; AVX512F-NEXT:    vmovdqa 1408(%rdi), %ymm7
-; AVX512F-NEXT:    vpalignr {{.*#+}} ymm7 = mem[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23]
-; AVX512F-NEXT:    vextracti128 $1, %ymm7, %xmm7
-; AVX512F-NEXT:    vinserti32x4 $0, %xmm7, %zmm16, %zmm7
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm16, %zmm7 {%k2}
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm15[4,5,6,7]
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm13, %zmm15, %zmm13
-; AVX512F-NEXT:    vmovdqa 192(%rdi), %ymm15
-; AVX512F-NEXT:    vpalignr $8, (%rsp), %ymm15, %ymm15 # 32-byte Folded Reload
-; AVX512F-NEXT:    # ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23]
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm15[4,5,6,7]
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm10, %zmm15, %zmm10
-; AVX512F-NEXT:    vmovdqa 1536(%rdi), %ymm15
-; AVX512F-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload
-; AVX512F-NEXT:    # ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23]
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm15[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm30, %zmm2
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
-; AVX512F-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload
-; AVX512F-NEXT:    # ymm15 = ymm15[0,1,2,3],mem[4,5,6,7]
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
-; AVX512F-NEXT:    vinsertf64x4 $0, %ymm15, %zmm16, %zmm15
-; AVX512F-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload
-; AVX512F-NEXT:    # ymm14 = ymm14[0,1,2,3],mem[4,5,6,7]
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm14, %zmm16, %zmm14
-; AVX512F-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
-; AVX512F-NEXT:    # ymm8 = ymm8[0,1,2,3],mem[4,5,6,7]
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm8, %zmm16, %zmm8
-; AVX512F-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
-; AVX512F-NEXT:    # ymm11 = ymm11[0,1,2,3],mem[4,5,6,7]
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm11, %zmm16, %zmm11
-; AVX512F-NEXT:    vmovdqa64 %zmm21, 192(%rsi)
-; AVX512F-NEXT:    vmovdqa64 %zmm12, 128(%rsi)
-; AVX512F-NEXT:    vmovdqa64 %zmm18, 64(%rsi)
-; AVX512F-NEXT:    vmovdqa64 %zmm17, (%rsi)
-; AVX512F-NEXT:    vmovdqa64 %zmm0, 192(%rdx)
-; AVX512F-NEXT:    vmovdqa64 %zmm5, (%rdx)
-; AVX512F-NEXT:    vmovdqa64 %zmm3, 64(%rdx)
-; AVX512F-NEXT:    vmovdqa64 %zmm1, 128(%rdx)
-; AVX512F-NEXT:    vmovdqa64 %zmm20, 192(%rcx)
-; AVX512F-NEXT:    vmovdqa64 %zmm26, (%rcx)
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm29 {%k2}
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm21 {%k2}
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm16 {%k2}
+; AVX512F-NEXT:    vinserti32x4 $0, %xmm15, %zmm17, %zmm17
+; AVX512F-NEXT:    vmovdqa64 %zmm22, %zmm17 {%k2}
+; AVX512F-NEXT:    vinserti32x4 $0, %xmm12, %zmm20, %zmm12
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm12 {%k2}
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm14[4,5,6,7]
+; AVX512F-NEXT:    vmovdqa 64(%rdi), %ymm14
+; AVX512F-NEXT:    vpalignr {{.*#+}} ymm14 = mem[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23]
+; AVX512F-NEXT:    vextracti128 $1, %ymm14, %xmm14
+; AVX512F-NEXT:    vinserti32x4 $0, %xmm14, %zmm19, %zmm9
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm9 {%k2}
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm24, %zmm1
+; AVX512F-NEXT:    vmovdqa 640(%rdi), %ymm14
+; AVX512F-NEXT:    vmovdqa64 %ymm25, %ymm2
+; AVX512F-NEXT:    vpalignr {{.*#+}} ymm14 = ymm2[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23]
+; AVX512F-NEXT:    vmovdqa 1408(%rdi), %ymm15
+; AVX512F-NEXT:    vpalignr {{.*#+}} ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23]
+; AVX512F-NEXT:    vextracti128 $1, %ymm15, %xmm15
+; AVX512F-NEXT:    vinserti32x4 $0, %xmm15, %zmm4, %zmm4
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm4 {%k2}
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm3 = ymm7[0,1,2,3],ymm14[4,5,6,7]
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm3, %zmm2, %zmm3
+; AVX512F-NEXT:    vmovdqa 192(%rdi), %ymm14
+; AVX512F-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload
+; AVX512F-NEXT:    # ymm14 = mem[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23]
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm14[4,5,6,7]
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm6, %zmm2
+; AVX512F-NEXT:    vmovdqa 1536(%rdi), %ymm14
+; AVX512F-NEXT:    vpalignr {{.*#+}} ymm11 = ymm11[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23]
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7]
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm6, %zmm0
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512F-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm11 # 32-byte Folded Reload
+; AVX512F-NEXT:    # ymm11 = ymm6[0,1,2,3],mem[4,5,6,7]
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512F-NEXT:    vinsertf64x4 $0, %ymm11, %zmm6, %zmm11
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512F-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm8 # 32-byte Folded Reload
+; AVX512F-NEXT:    # ymm8 = ymm6[0,1,2,3],mem[4,5,6,7]
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512F-NEXT:    vinsertf64x4 $0, %ymm8, %zmm6, %zmm8
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512F-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm7 # 32-byte Folded Reload
+; AVX512F-NEXT:    # ymm7 = ymm6[0,1,2,3],mem[4,5,6,7]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm7, %zmm10, %zmm6
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512F-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
+; AVX512F-NEXT:    # ymm7 = ymm7[0,1,2,3],mem[4,5,6,7]
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512F-NEXT:    vinsertf64x4 $0, %ymm7, %zmm10, %zmm7
+; AVX512F-NEXT:    vmovdqa64 %zmm18, 192(%rsi)
+; AVX512F-NEXT:    vmovdqa64 %zmm13, 128(%rsi)
+; AVX512F-NEXT:    vmovdqa64 %zmm30, 64(%rsi)
+; AVX512F-NEXT:    vmovdqa64 %zmm28, (%rsi)
+; AVX512F-NEXT:    vmovdqa64 %zmm27, 192(%rdx)
+; AVX512F-NEXT:    vmovdqa64 %zmm23, (%rdx)
+; AVX512F-NEXT:    vmovdqa64 %zmm26, 64(%rdx)
+; AVX512F-NEXT:    vmovdqa64 %zmm31, 128(%rdx)
+; AVX512F-NEXT:    vmovdqa64 %zmm16, 192(%rcx)
+; AVX512F-NEXT:    vmovdqa64 %zmm21, (%rcx)
 ; AVX512F-NEXT:    vmovdqa64 %zmm29, 64(%rcx)
-; AVX512F-NEXT:    vmovdqa64 %zmm22, 128(%rcx)
-; AVX512F-NEXT:    vmovdqa64 %zmm7, 192(%r8)
+; AVX512F-NEXT:    vmovdqa64 %zmm5, 128(%rcx)
+; AVX512F-NEXT:    vmovdqa64 %zmm4, 192(%r8)
 ; AVX512F-NEXT:    vmovdqa64 %zmm9, (%r8)
-; AVX512F-NEXT:    vmovdqa64 %zmm6, 64(%r8)
-; AVX512F-NEXT:    vmovdqa64 %zmm4, 128(%r8)
-; AVX512F-NEXT:    vmovdqa64 %zmm28, 192(%r9)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, (%r9)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 64(%r9)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 128(%r9)
+; AVX512F-NEXT:    vmovdqa64 %zmm12, 64(%r8)
+; AVX512F-NEXT:    vmovdqa64 %zmm17, 128(%r8)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm4, 192(%r9)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm4, (%r9)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm4, 64(%r9)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm4, 128(%r9)
 ; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512F-NEXT:    vmovdqa64 %zmm2, 192(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm10, (%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm13, 64(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm19, 128(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm0, 192(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm2, (%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm3, 64(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm1, 128(%rax)
 ; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512F-NEXT:    vmovdqa64 %zmm11, 128(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm8, 192(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm14, (%rax)
-; AVX512F-NEXT:    vmovaps %zmm15, 64(%rax)
-; AVX512F-NEXT:    addq $2216, %rsp # imm = 0x8A8
+; AVX512F-NEXT:    vmovaps %zmm7, 128(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm6, 192(%rax)
+; AVX512F-NEXT:    vmovaps %zmm8, (%rax)
+; AVX512F-NEXT:    vmovaps %zmm11, 64(%rax)
+; AVX512F-NEXT:    addq $2728, %rsp # imm = 0xAA8
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512BW-LABEL: load_i64_stride7_vf32:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    subq $2152, %rsp # imm = 0x868
-; AVX512BW-NEXT:    vmovdqa64 1664(%rdi), %zmm21
-; AVX512BW-NEXT:    vmovdqa64 1600(%rdi), %zmm31
-; AVX512BW-NEXT:    vmovdqa64 1216(%rdi), %zmm2
-; AVX512BW-NEXT:    vmovdqa64 1152(%rdi), %zmm6
-; AVX512BW-NEXT:    vmovdqa64 1088(%rdi), %zmm12
+; AVX512BW-NEXT:    subq $2760, %rsp # imm = 0xAC8
+; AVX512BW-NEXT:    vmovdqa64 1216(%rdi), %zmm20
+; AVX512BW-NEXT:    vmovdqa64 1152(%rdi), %zmm2
+; AVX512BW-NEXT:    vmovdqa64 960(%rdi), %zmm29
+; AVX512BW-NEXT:    vmovdqa64 896(%rdi), %zmm4
+; AVX512BW-NEXT:    vmovdqa64 768(%rdi), %zmm1
+; AVX512BW-NEXT:    vmovdqa64 704(%rdi), %zmm30
+; AVX512BW-NEXT:    vmovdqa64 512(%rdi), %zmm28
+; AVX512BW-NEXT:    vmovdqa64 448(%rdi), %zmm12
+; AVX512BW-NEXT:    vmovdqa64 320(%rdi), %zmm26
+; AVX512BW-NEXT:    vmovdqa64 256(%rdi), %zmm17
+; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %zmm6
+; AVX512BW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm3
+; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm25 = [0,7,14,0,0,7,14,0]
+; AVX512BW-NEXT:    # zmm25 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vpermt2q %zmm6, %zmm25, %zmm3
+; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm8 = [10,3,10,3,10,3,10,3]
+; AVX512BW-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vmovdqa64 %zmm17, %zmm6
+; AVX512BW-NEXT:    vpermt2q %zmm26, %zmm8, %zmm6
+; AVX512BW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm12, %zmm3
 ; AVX512BW-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 1024(%rdi), %zmm3
+; AVX512BW-NEXT:    vpermt2q %zmm28, %zmm25, %zmm3
 ; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 768(%rdi), %zmm1
-; AVX512BW-NEXT:    vmovdqa64 704(%rdi), %zmm10
-; AVX512BW-NEXT:    vmovdqa64 640(%rdi), %zmm11
-; AVX512BW-NEXT:    vmovaps 576(%rdi), %zmm0
-; AVX512BW-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 320(%rdi), %zmm0
-; AVX512BW-NEXT:    vmovdqa64 256(%rdi), %zmm5
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm7 = [10,3,10,3,10,3,10,3]
-; AVX512BW-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm8
-; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm7, %zmm8
-; AVX512BW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm8
-; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm7, %zmm8
-; AVX512BW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm8
-; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm7, %zmm8
-; AVX512BW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermi2q %zmm21, %zmm31, %zmm7
-; AVX512BW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm30, %zmm3
+; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm8, %zmm3
+; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm11 = [13,6,13,6,13,6,13,6]
+; AVX512BW-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm18 = [9,0,7,0,9,0,7,0]
+; AVX512BW-NEXT:    # zmm18 = mem[0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm7 = [11,4,11,4,11,4,11,4]
 ; AVX512BW-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm8
-; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm7, %zmm8
-; AVX512BW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm13
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm7 = [13,6,13,6,13,6,13,6]
-; AVX512BW-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm8
-; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm13, %zmm8
-; AVX512BW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm8
-; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm18
-; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm13, %zmm18
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm19 = [12,5,12,5,12,5,12,5]
-; AVX512BW-NEXT:    # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm17
-; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm19, %zmm17
-; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm22
-; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm19, %zmm22
-; AVX512BW-NEXT:    vmovdqa64 %zmm22, %zmm23
-; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm16
-; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm7, %zmm16
-; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm22
-; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm7, %zmm22
-; AVX512BW-NEXT:    vmovdqa64 %zmm22, %zmm24
-; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm28 = [0,7,14,0,0,7,14,0]
-; AVX512BW-NEXT:    # zmm28 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm15
-; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm28, %zmm15
-; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm25
-; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm28, %zmm25
-; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm29 = [9,0,7,0,9,0,7,0]
-; AVX512BW-NEXT:    # zmm29 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm26
-; AVX512BW-NEXT:    vpermt2q %zmm10, %zmm29, %zmm26
-; AVX512BW-NEXT:    vmovdqa64 %zmm26, %zmm30
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm4 = [2,9,2,9,2,9,2,9]
-; AVX512BW-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm4, %zmm10
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm9
-; AVX512BW-NEXT:    vpermt2q %zmm8, %zmm29, %zmm9
-; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm4, %zmm8
-; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm1
-; AVX512BW-NEXT:    vpermt2q %zmm12, %zmm7, %zmm3
+; AVX512BW-NEXT:    vmovdqa64 %zmm30, %zmm3
+; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm7, %zmm3
 ; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm14, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm11, %zmm7, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 192(%rdi), %zmm22
-; AVX512BW-NEXT:    vmovdqa64 128(%rdi), %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm22, %zmm7, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm31, %zmm5
-; AVX512BW-NEXT:    vpermi2q %zmm21, %zmm31, %zmm13
-; AVX512BW-NEXT:    vmovdqa64 1536(%rdi), %zmm3
-; AVX512BW-NEXT:    vmovdqa64 1472(%rdi), %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm7, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm0
-; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm31
-; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm19, %zmm31
-; AVX512BW-NEXT:    vpermi2q %zmm21, %zmm5, %zmm19
-; AVX512BW-NEXT:    vmovdqa64 %zmm19, %zmm26
-; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm27
-; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm7, %zmm27
-; AVX512BW-NEXT:    vpermi2q %zmm21, %zmm5, %zmm1
-; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm20
-; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm28, %zmm20
-; AVX512BW-NEXT:    vpermi2q %zmm21, %zmm5, %zmm28
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm12
-; AVX512BW-NEXT:    vpermt2q %zmm6, %zmm29, %zmm12
-; AVX512BW-NEXT:    vpermi2q %zmm5, %zmm21, %zmm29
-; AVX512BW-NEXT:    vpermt2q %zmm21, %zmm4, %zmm5
-; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm21
-; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm4, %zmm0
-; AVX512BW-NEXT:    vmovdqa64 384(%rdi), %zmm2
-; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,9,0,5,6,9]
-; AVX512BW-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm4, %zmm5
+; AVX512BW-NEXT:    vmovdqa64 %zmm17, %zmm3
+; AVX512BW-NEXT:    vpermt2q %zmm26, %zmm7, %zmm3
+; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm23 = [12,5,12,5,12,5,12,5]
+; AVX512BW-NEXT:    # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vmovdqa64 %zmm30, %zmm3
+; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm23, %zmm3
+; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm17, %zmm3
+; AVX512BW-NEXT:    vpermt2q %zmm26, %zmm23, %zmm3
+; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm30, %zmm3
+; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm11, %zmm3
+; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm17, %zmm3
+; AVX512BW-NEXT:    vpermt2q %zmm26, %zmm11, %zmm3
+; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm30, %zmm3
+; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm25, %zmm3
+; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm17, %zmm3
+; AVX512BW-NEXT:    vpermt2q %zmm26, %zmm25, %zmm3
+; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm3
+; AVX512BW-NEXT:    vpermt2q %zmm30, %zmm18, %zmm3
+; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [2,9,2,9,2,9,2,9]
+; AVX512BW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm0, %zmm30
+; AVX512BW-NEXT:    vmovdqa64 %zmm26, %zmm10
+; AVX512BW-NEXT:    vpermt2q %zmm17, %zmm18, %zmm10
+; AVX512BW-NEXT:    vpermt2q %zmm26, %zmm0, %zmm17
+; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm1
+; AVX512BW-NEXT:    vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm29, %zmm25, %zmm1
+; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm6
+; AVX512BW-NEXT:    vpermt2q %zmm20, %zmm8, %zmm6
+; AVX512BW-NEXT:    vmovdqa64 1408(%rdi), %zmm24
+; AVX512BW-NEXT:    vmovdqa64 1344(%rdi), %zmm15
+; AVX512BW-NEXT:    vmovdqa64 %zmm15, %zmm3
+; AVX512BW-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm24, %zmm25, %zmm3
+; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 1664(%rdi), %zmm1
+; AVX512BW-NEXT:    vmovdqa64 1600(%rdi), %zmm16
+; AVX512BW-NEXT:    vpermi2q %zmm1, %zmm16, %zmm8
+; AVX512BW-NEXT:    vmovdqa64 1088(%rdi), %zmm13
+; AVX512BW-NEXT:    vmovdqa64 1024(%rdi), %zmm5
 ; AVX512BW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 832(%rdi), %zmm5
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm5, %zmm4, %zmm6
+; AVX512BW-NEXT:    vpermt2q %zmm13, %zmm11, %zmm5
+; AVX512BW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm18, %zmm9
+; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm18, %zmm29
+; AVX512BW-NEXT:    vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm4
+; AVX512BW-NEXT:    vpermt2q %zmm20, %zmm7, %zmm4
+; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm14
+; AVX512BW-NEXT:    vmovdqa64 640(%rdi), %zmm18
+; AVX512BW-NEXT:    vmovdqa64 576(%rdi), %zmm4
+; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm18, %zmm11, %zmm4
+; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm12, %zmm9, %zmm28
+; AVX512BW-NEXT:    vmovdqa64 192(%rdi), %zmm19
+; AVX512BW-NEXT:    vmovdqa64 128(%rdi), %zmm4
+; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm19, %zmm11, %zmm4
+; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm26 # 64-byte Folded Reload
+; AVX512BW-NEXT:    vmovdqa64 1536(%rdi), %zmm12
+; AVX512BW-NEXT:    vmovdqa64 1472(%rdi), %zmm4
+; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm12, %zmm11, %zmm4
+; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm24, %zmm22
+; AVX512BW-NEXT:    vpermt2q %zmm15, %zmm9, %zmm22
+; AVX512BW-NEXT:    vpermi2q %zmm1, %zmm16, %zmm7
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm29
+; AVX512BW-NEXT:    vpermt2q %zmm20, %zmm23, %zmm29
+; AVX512BW-NEXT:    vpermi2q %zmm1, %zmm16, %zmm23
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm31
+; AVX512BW-NEXT:    vpermt2q %zmm20, %zmm11, %zmm31
+; AVX512BW-NEXT:    vpermi2q %zmm1, %zmm16, %zmm11
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm15
+; AVX512BW-NEXT:    vpermt2q %zmm20, %zmm25, %zmm15
+; AVX512BW-NEXT:    vpermi2q %zmm1, %zmm16, %zmm25
+; AVX512BW-NEXT:    vmovdqa64 %zmm20, %zmm21
+; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm9, %zmm21
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm4
+; AVX512BW-NEXT:    vpermi2q %zmm16, %zmm1, %zmm9
+; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm0, %zmm16
+; AVX512BW-NEXT:    vpermt2q %zmm20, %zmm0, %zmm4
+; AVX512BW-NEXT:    vmovdqa64 384(%rdi), %zmm0
+; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,9,0,5,6,9]
+; AVX512BW-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm1, %zmm2
+; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 832(%rdi), %zmm3
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm1, %zmm2
+; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 1280(%rdi), %zmm5
+; AVX512BW-NEXT:    vpermt2q %zmm5, %zmm1, %zmm6
 ; AVX512BW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 1280(%rdi), %zmm6
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm6, %zmm4, %zmm7
+; AVX512BW-NEXT:    vmovdqa64 1728(%rdi), %zmm6
+; AVX512BW-NEXT:    vpermt2q %zmm6, %zmm1, %zmm8
+; AVX512BW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,10,0,5,6,10]
+; AVX512BW-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vpermt2q %zmm5, %zmm1, %zmm14
+; AVX512BW-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm1, %zmm2
+; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm1, %zmm2
+; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm6, %zmm1, %zmm7
 ; AVX512BW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 1728(%rdi), %zmm7
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm7, %zmm4, %zmm19
-; AVX512BW-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,10,0,5,6,10]
-; AVX512BW-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm6, %zmm4, %zmm19
-; AVX512BW-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm5, %zmm4, %zmm19
-; AVX512BW-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm4, %zmm18
-; AVX512BW-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm7, %zmm4, %zmm13
-; AVX512BW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,11,0,5,6,11]
-; AVX512BW-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vpermt2q %zmm6, %zmm4, %zmm31
-; AVX512BW-NEXT:    vpermt2q %zmm5, %zmm4, %zmm17
-; AVX512BW-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm4, %zmm23
+; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,11,0,5,6,11]
+; AVX512BW-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vpermt2q %zmm5, %zmm1, %zmm29
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm1, %zmm2
+; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm1, %zmm2
+; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm6, %zmm1, %zmm23
 ; AVX512BW-NEXT:    vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm7, %zmm4, %zmm26
-; AVX512BW-NEXT:    vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,12,0,5,6,12]
-; AVX512BW-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vpermt2q %zmm6, %zmm4, %zmm27
-; AVX512BW-NEXT:    vpermt2q %zmm5, %zmm4, %zmm16
-; AVX512BW-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm4, %zmm24
-; AVX512BW-NEXT:    vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm7, %zmm4, %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,13,4,5,6,13]
-; AVX512BW-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vpermt2q %zmm6, %zmm4, %zmm20
-; AVX512BW-NEXT:    vpermt2q %zmm5, %zmm4, %zmm15
-; AVX512BW-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm4, %zmm25
-; AVX512BW-NEXT:    vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm7, %zmm4, %zmm28
-; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,14,4,5,6,14]
-; AVX512BW-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vpermt2q %zmm6, %zmm4, %zmm12
-; AVX512BW-NEXT:    vpermt2q %zmm5, %zmm4, %zmm30
-; AVX512BW-NEXT:    vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm4, %zmm9
-; AVX512BW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm7, %zmm4, %zmm29
-; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,8,15,4,5,8,15]
-; AVX512BW-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vpermt2q %zmm5, %zmm4, %zmm10
-; AVX512BW-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm4, %zmm8
+; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,12,0,5,6,12]
+; AVX512BW-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vpermt2q %zmm5, %zmm1, %zmm31
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm1, %zmm2
+; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm1, %zmm2
+; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm6, %zmm1, %zmm11
+; AVX512BW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [4,5,6,13,4,5,6,13]
+; AVX512BW-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vpermt2q %zmm5, %zmm1, %zmm15
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm1, %zmm2
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm1, %zmm7
+; AVX512BW-NEXT:    vpermt2q %zmm6, %zmm1, %zmm25
+; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [4,5,6,14,4,5,6,14]
+; AVX512BW-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vpermt2q %zmm5, %zmm1, %zmm21
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm1, %zmm8
 ; AVX512BW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm7, %zmm4, %zmm21
-; AVX512BW-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm6, %zmm4, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm1, %zmm10
+; AVX512BW-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm6, %zmm1, %zmm9
+; AVX512BW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [4,5,8,15,4,5,8,15]
+; AVX512BW-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm1, %zmm30
+; AVX512BW-NEXT:    vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm1, %zmm17
+; AVX512BW-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm6, %zmm1, %zmm16
+; AVX512BW-NEXT:    vpermt2q %zmm5, %zmm1, %zmm4
+; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    movb $24, %al
 ; AVX512BW-NEXT:    kmovd %eax, %k1
-; AVX512BW-NEXT:    vmovdqa64 512(%rdi), %zmm2
-; AVX512BW-NEXT:    vmovdqa64 448(%rdi), %zmm15
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} ymm21 = <0,7,14,u>
-; AVX512BW-NEXT:    vmovdqa64 %zmm15, %zmm24
-; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm21, %zmm24
-; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm0
-; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm24 {%k1} = zmm14[4,5,4,5],zmm11[4,5,4,5]
-; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm11 = [14,0,0,7,14,0,0,7]
-; AVX512BW-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm14, %zmm13
-; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm11, %zmm13
-; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm17 = [7,0,9,0,7,0,9,0]
-; AVX512BW-NEXT:    # zmm17 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm26
-; AVX512BW-NEXT:    vpermt2q %zmm14, %zmm17, %zmm26
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} ymm19 = [4,11,4,11]
-; AVX512BW-NEXT:    # ymm19 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm19, %zmm14
-; AVX512BW-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %zmm4
-; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm14
-; AVX512BW-NEXT:    vmovdqa64 %zmm14, %zmm18
-; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm21, %zmm18
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm18 {%k1} = zmm0[4,5,4,5],zmm22[4,5,4,5]
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm23
-; AVX512BW-NEXT:    vpermt2q %zmm22, %zmm11, %zmm23
-; AVX512BW-NEXT:    vmovdqa64 %zmm22, %zmm30
-; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm17, %zmm30
-; AVX512BW-NEXT:    vpermt2q %zmm22, %zmm19, %zmm0
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} = zmm1[4,5,4,5],zmm18[4,5,4,5]
 ; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 960(%rdi), %zmm7
-; AVX512BW-NEXT:    vmovdqa64 896(%rdi), %zmm10
-; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm16
-; AVX512BW-NEXT:    vpermt2q %zmm7, %zmm21, %zmm16
-; AVX512BW-NEXT:    vmovdqa64 1408(%rdi), %zmm5
-; AVX512BW-NEXT:    vmovdqa64 1344(%rdi), %zmm8
-; AVX512BW-NEXT:    vpermi2q %zmm5, %zmm8, %zmm21
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm21 {%k1} = zmm6[4,5,4,5],zmm3[4,5,4,5]
+; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7]
+; AVX512BW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm3
+; AVX512BW-NEXT:    vpermt2q %zmm18, %zmm0, %zmm3
+; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [7,0,9,0,7,0,9,0]
+; AVX512BW-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vmovdqa64 %zmm18, %zmm27
+; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm10, %zmm27
+; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [4,11,4,11]
+; AVX512BW-NEXT:    # ymm5 = mem[0,1,0,1]
+; AVX512BW-NEXT:    vpermt2q %zmm18, %zmm5, %zmm1
+; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm22, %zmm11, %zmm0
-; AVX512BW-NEXT:    vpermi2q %zmm3, %zmm6, %zmm11
-; AVX512BW-NEXT:    vmovdqa64 %zmm22, %zmm25
-; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm17, %zmm25
-; AVX512BW-NEXT:    vpermi2q %zmm6, %zmm3, %zmm17
-; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm19, %zmm6
-; AVX512BW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm16 {%k1} = zmm1[4,5,4,5],zmm22[4,5,4,5]
-; AVX512BW-NEXT:    vpermt2q %zmm22, %zmm19, %zmm1
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm4[4,5,4,5],zmm19[4,5,4,5]
 ; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa 912(%rdi), %xmm1
-; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3]
-; AVX512BW-NEXT:    vinserti32x4 $0, %xmm1, %zmm0, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa 464(%rdi), %xmm0
-; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3]
-; AVX512BW-NEXT:    vinserti32x4 $0, %xmm0, %zmm13, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa 16(%rdi), %xmm0
-; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3]
-; AVX512BW-NEXT:    vinserti32x4 $0, %xmm0, %zmm23, %zmm23
-; AVX512BW-NEXT:    vmovdqa 1360(%rdi), %xmm0
-; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3]
-; AVX512BW-NEXT:    vinserti32x4 $0, %xmm0, %zmm11, %zmm22
+; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm18
+; AVX512BW-NEXT:    vpermt2q %zmm19, %zmm0, %zmm18
+; AVX512BW-NEXT:    vmovdqa64 %zmm19, %zmm23
+; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm10, %zmm23
+; AVX512BW-NEXT:    vpermt2q %zmm19, %zmm5, %zmm4
+; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
+; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm4[4,5,4,5],zmm12[4,5,4,5]
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm19
+; AVX512BW-NEXT:    vpermt2q %zmm13, %zmm0, %zmm19
+; AVX512BW-NEXT:    vpermi2q %zmm12, %zmm4, %zmm0
+; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm20
+; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm10, %zmm20
+; AVX512BW-NEXT:    vpermi2q %zmm4, %zmm12, %zmm10
+; AVX512BW-NEXT:    vpermt2q %zmm12, %zmm5, %zmm4
+; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm14 {%k1} = zmm1[4,5,4,5],zmm13[4,5,4,5]
+; AVX512BW-NEXT:    vpermt2q %zmm13, %zmm5, %zmm1
+; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa 912(%rdi), %xmm5
+; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3]
+; AVX512BW-NEXT:    vinserti32x4 $0, %xmm5, %zmm19, %zmm1
+; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa 464(%rdi), %xmm5
+; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3]
+; AVX512BW-NEXT:    vinserti32x4 $0, %xmm5, %zmm3, %zmm1
+; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa 16(%rdi), %xmm3
+; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0,1],mem[2,3]
+; AVX512BW-NEXT:    vinserti32x4 $0, %xmm3, %zmm18, %zmm19
+; AVX512BW-NEXT:    vmovdqa 1360(%rdi), %xmm3
+; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0,1],mem[2,3]
+; AVX512BW-NEXT:    vinserti32x4 $0, %xmm3, %zmm0, %zmm18
 ; AVX512BW-NEXT:    vinserti128 $1, 1088(%rdi), %ymm0, %ymm0
-; AVX512BW-NEXT:    vmovdqa 1024(%rdi), %ymm1
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; AVX512BW-NEXT:    vmovdqa64 %ymm1, %ymm19
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm1 = [4,11]
-; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm11
-; AVX512BW-NEXT:    vpermt2q %zmm7, %zmm1, %zmm11
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm20, %zmm0
+; AVX512BW-NEXT:    vmovdqa 1024(%rdi), %ymm13
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5],ymm0[6,7]
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = [4,11]
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm12
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm8, %zmm3, %zmm12
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm15, %zmm0
 ; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vinserti128 $1, 640(%rdi), %ymm0, %ymm0
-; AVX512BW-NEXT:    vmovdqa 576(%rdi), %ymm3
-; AVX512BW-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7]
-; AVX512BW-NEXT:    vmovdqa64 %zmm15, %zmm3
-; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm1, %zmm3
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm3, %zmm0
+; AVX512BW-NEXT:    vmovdqa 576(%rdi), %ymm1
+; AVX512BW-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm11
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm3, %zmm11
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm0
 ; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vinserti128 $1, 192(%rdi), %ymm0, %ymm0
-; AVX512BW-NEXT:    vmovdqa 128(%rdi), %ymm3
-; AVX512BW-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7]
-; AVX512BW-NEXT:    vmovdqa64 %zmm14, %zmm3
-; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm1, %zmm3
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm3, %zmm0
+; AVX512BW-NEXT:    vmovdqa 128(%rdi), %ymm12
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5],ymm0[6,7]
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm11
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm3, %zmm11
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm7, %zmm0
 ; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vinserti128 $1, 1536(%rdi), %ymm0, %ymm0
-; AVX512BW-NEXT:    vmovdqa 1472(%rdi), %ymm3
-; AVX512BW-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7]
-; AVX512BW-NEXT:    vpermi2q %zmm5, %zmm8, %zmm1
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm28, %zmm0
+; AVX512BW-NEXT:    vmovdqa 1472(%rdi), %ymm2
+; AVX512BW-NEXT:    vmovdqu %ymm2, (%rsp) # 32-byte Spill
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-NEXT:    vpermi2q %zmm24, %zmm2, %zmm3
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm25, %zmm0
+; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm7 = [5,12]
+; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm11
+; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm7, %zmm11
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} xmm30 = [6,13]
+; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm30, %zmm6
+; AVX512BW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm3
+; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm7, %zmm3
+; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm30, %zmm5
+; AVX512BW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm0
+; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm4
+; AVX512BW-NEXT:    vpermt2q %zmm8, %zmm7, %zmm0
+; AVX512BW-NEXT:    vpermi2q %zmm24, %zmm2, %zmm7
+; AVX512BW-NEXT:    vpermt2q %zmm24, %zmm30, %zmm2
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm8
+; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm30, %zmm9
+; AVX512BW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 1088(%rdi), %ymm24
+; AVX512BW-NEXT:    vpalignr {{.*#+}} ymm5 = ymm13[8,9,10,11,12,13,14,15],ymm24[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm24[16,17,18,19,20,21,22,23]
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm5[4,5,6,7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm21, %zmm0
 ; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm0 = <9,0,7,u>
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm3
-; AVX512BW-NEXT:    vpermt2q %zmm15, %zmm0, %zmm3
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm1 = [5,12]
-; AVX512BW-NEXT:    vmovdqa64 %zmm15, %zmm13
-; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm1, %zmm13
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm6 = [6,13]
-; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm6, %zmm15
-; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm2
-; AVX512BW-NEXT:    vpermt2q %zmm14, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 %zmm14, %zmm11
-; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm1, %zmm11
-; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm6, %zmm14
-; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm4
-; AVX512BW-NEXT:    vpermt2q %zmm10, %zmm0, %zmm4
-; AVX512BW-NEXT:    vpermi2q %zmm8, %zmm5, %zmm0
-; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm9
-; AVX512BW-NEXT:    vpermt2q %zmm7, %zmm1, %zmm9
-; AVX512BW-NEXT:    vpermi2q %zmm5, %zmm8, %zmm1
-; AVX512BW-NEXT:    vpermt2q %zmm5, %zmm6, %zmm8
-; AVX512BW-NEXT:    vpermt2q %zmm7, %zmm6, %zmm10
-; AVX512BW-NEXT:    vmovdqa 1088(%rdi), %ymm5
-; AVX512BW-NEXT:    vpalignr {{.*#+}} ymm5 = ymm19[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm19[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23]
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm5, %zmm12, %zmm28
 ; AVX512BW-NEXT:    movb $-32, %al
 ; AVX512BW-NEXT:    kmovd %eax, %k2
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm24 {%k2}
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm25 {%k2}
+; AVX512BW-NEXT:    vmovdqa64 %zmm14, %zmm2
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k2}
+; AVX512BW-NEXT:    vmovdqa64 %zmm17, %zmm14
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm14 {%k2}
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k2}
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm30
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm28 {%k1}
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm28 {%k2}
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm26 {%k1}
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm26 {%k2}
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm22 {%k1}
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm22 {%k2}
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm18 {%k2}
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm24 {%k2}
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm16 {%k2}
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm21 {%k2}
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm4 {%k1}
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm4 {%k2}
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm3 {%k1}
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm3 {%k2}
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm2 {%k1}
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm2 {%k2}
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm0 {%k1}
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm0 {%k2}
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm31, %zmm12 {%k2}
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm19 {%k2}
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm23 {%k2}
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm22 {%k2}
-; AVX512BW-NEXT:    vmovdqa 960(%rdi), %ymm6
+; AVX512BW-NEXT:    vmovdqa64 %zmm29, %zmm5 {%k2}
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm29 {%k2}
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm19 {%k2}
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm18 {%k2}
+; AVX512BW-NEXT:    vmovdqa 960(%rdi), %ymm1
+; AVX512BW-NEXT:    vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
+; AVX512BW-NEXT:    vextracti128 $1, %ymm1, %xmm1
+; AVX512BW-NEXT:    vinserti32x4 $0, %xmm1, %zmm20, %zmm1
+; AVX512BW-NEXT:    vmovdqa64 %zmm31, %zmm1 {%k2}
+; AVX512BW-NEXT:    vmovdqa 512(%rdi), %ymm6
 ; AVX512BW-NEXT:    vpalignr {{.*#+}} ymm6 = mem[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23]
 ; AVX512BW-NEXT:    vextracti128 $1, %ymm6, %xmm6
-; AVX512BW-NEXT:    vinserti32x4 $0, %xmm6, %zmm25, %zmm6
-; AVX512BW-NEXT:    vmovdqa64 %zmm27, %zmm6 {%k2}
-; AVX512BW-NEXT:    vmovdqa 512(%rdi), %ymm7
-; AVX512BW-NEXT:    vpalignr {{.*#+}} ymm7 = mem[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23]
-; AVX512BW-NEXT:    vextracti128 $1, %ymm7, %xmm7
-; AVX512BW-NEXT:    vinserti32x4 $0, %xmm7, %zmm26, %zmm7
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm7 {%k2}
-; AVX512BW-NEXT:    vmovdqa 64(%rdi), %ymm9
-; AVX512BW-NEXT:    vpalignr {{.*#+}} ymm9 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23]
-; AVX512BW-NEXT:    vextracti128 $1, %ymm9, %xmm9
-; AVX512BW-NEXT:    vinserti32x4 $0, %xmm9, %zmm30, %zmm9
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm9 {%k2}
-; AVX512BW-NEXT:    vmovdqa64 1408(%rdi), %ymm20
-; AVX512BW-NEXT:    vpalignr {{.*#+}} ymm20 = mem[8,9,10,11,12,13,14,15],ymm20[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm20[16,17,18,19,20,21,22,23]
-; AVX512BW-NEXT:    vextracti32x4 $1, %ymm20, %xmm20
-; AVX512BW-NEXT:    vinserti32x4 $0, %xmm20, %zmm17, %zmm17
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm17 {%k2}
-; AVX512BW-NEXT:    vmovdqa64 640(%rdi), %ymm20
-; AVX512BW-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm20, %ymm5 # 32-byte Folded Reload
-; AVX512BW-NEXT:    # ymm5 = mem[8,9,10,11,12,13,14,15],ymm20[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm20[16,17,18,19,20,21,22,23]
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3],ymm5[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm5, %zmm13, %zmm5
+; AVX512BW-NEXT:    vinserti32x4 $0, %xmm6, %zmm27, %zmm6
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm6 {%k2}
+; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %ymm17
+; AVX512BW-NEXT:    vpalignr {{.*#+}} ymm17 = mem[8,9,10,11,12,13,14,15],ymm17[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm17[16,17,18,19,20,21,22,23]
+; AVX512BW-NEXT:    vextracti32x4 $1, %ymm17, %xmm17
+; AVX512BW-NEXT:    vinserti32x4 $0, %xmm17, %zmm23, %zmm15
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm15 {%k2}
+; AVX512BW-NEXT:    vmovdqa64 1408(%rdi), %ymm17
+; AVX512BW-NEXT:    vpalignr {{.*#+}} ymm17 = mem[8,9,10,11,12,13,14,15],ymm17[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm17[16,17,18,19,20,21,22,23]
+; AVX512BW-NEXT:    vextracti32x4 $1, %ymm17, %xmm17
+; AVX512BW-NEXT:    vinserti32x4 $0, %xmm17, %zmm10, %zmm21
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm21 {%k2}
+; AVX512BW-NEXT:    vmovdqa64 640(%rdi), %ymm17
+; AVX512BW-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm17, %ymm13 # 32-byte Folded Reload
+; AVX512BW-NEXT:    # ymm13 = mem[8,9,10,11,12,13,14,15],ymm17[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm17[16,17,18,19,20,21,22,23]
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm11[0,1,2,3],ymm13[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm4, %zmm0, %zmm4
 ; AVX512BW-NEXT:    vmovdqa 192(%rdi), %ymm13
-; AVX512BW-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
-; AVX512BW-NEXT:    # ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23]
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm13[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm11, %zmm13, %zmm11
-; AVX512BW-NEXT:    vmovdqa 1536(%rdi), %ymm13
-; AVX512BW-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
-; AVX512BW-NEXT:    # ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23]
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm13[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm29, %zmm1
-; AVX512BW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm13 # 32-byte Folded Reload
-; AVX512BW-NEXT:    # ymm13 = ymm15[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm13, %zmm15, %zmm13
-; AVX512BW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload
-; AVX512BW-NEXT:    # ymm14 = ymm14[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm14, %zmm15, %zmm14
+; AVX512BW-NEXT:    vpalignr {{.*#+}} ymm12 = ymm12[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23]
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm12[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm3, %zmm0, %zmm3
+; AVX512BW-NEXT:    vmovdqa 1536(%rdi), %ymm12
+; AVX512BW-NEXT:    vpalignr $8, (%rsp), %ymm12, %ymm11 # 32-byte Folded Reload
+; AVX512BW-NEXT:    # ymm11 = mem[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23]
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm11[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm7, %zmm0
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512BW-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm11 # 32-byte Folded Reload
+; AVX512BW-NEXT:    # ymm11 = ymm7[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512BW-NEXT:    vinsertf64x4 $0, %ymm11, %zmm7, %zmm11
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512BW-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm9 # 32-byte Folded Reload
+; AVX512BW-NEXT:    # ymm9 = ymm7[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512BW-NEXT:    vinsertf64x4 $0, %ymm9, %zmm7, %zmm9
 ; AVX512BW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
 ; AVX512BW-NEXT:    # ymm8 = ymm8[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm8, %zmm15, %zmm8
-; AVX512BW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload
-; AVX512BW-NEXT:    # ymm10 = ymm10[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm10, %zmm15, %zmm10
-; AVX512BW-NEXT:    vmovdqa64 %zmm21, 192(%rsi)
-; AVX512BW-NEXT:    vmovdqa64 %zmm16, 128(%rsi)
-; AVX512BW-NEXT:    vmovdqa64 %zmm24, 64(%rsi)
-; AVX512BW-NEXT:    vmovdqa64 %zmm18, (%rsi)
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, 192(%rdx)
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, (%rdx)
-; AVX512BW-NEXT:    vmovdqa64 %zmm3, 64(%rdx)
-; AVX512BW-NEXT:    vmovdqa64 %zmm4, 128(%rdx)
-; AVX512BW-NEXT:    vmovdqa64 %zmm22, 192(%rcx)
-; AVX512BW-NEXT:    vmovdqa64 %zmm23, (%rcx)
-; AVX512BW-NEXT:    vmovdqa64 %zmm19, 64(%rcx)
-; AVX512BW-NEXT:    vmovdqa64 %zmm12, 128(%rcx)
-; AVX512BW-NEXT:    vmovdqa64 %zmm17, 192(%r8)
-; AVX512BW-NEXT:    vmovdqa64 %zmm9, (%r8)
-; AVX512BW-NEXT:    vmovdqa64 %zmm7, 64(%r8)
-; AVX512BW-NEXT:    vmovdqa64 %zmm6, 128(%r8)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 192(%r9)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, (%r9)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 64(%r9)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 128(%r9)
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm8, %zmm16, %zmm7
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
+; AVX512BW-NEXT:    # ymm8 = ymm8[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512BW-NEXT:    vinsertf64x4 $0, %ymm8, %zmm10, %zmm8
+; AVX512BW-NEXT:    vmovdqa64 %zmm14, 192(%rsi)
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, 128(%rsi)
+; AVX512BW-NEXT:    vmovdqa64 %zmm25, 64(%rsi)
+; AVX512BW-NEXT:    vmovdqa64 %zmm24, (%rsi)
+; AVX512BW-NEXT:    vmovdqa64 %zmm22, 192(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm26, (%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm28, 64(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm30, 128(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm18, 192(%rcx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm19, (%rcx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm29, 64(%rcx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm5, 128(%rcx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm21, 192(%r8)
+; AVX512BW-NEXT:    vmovdqa64 %zmm15, (%r8)
+; AVX512BW-NEXT:    vmovdqa64 %zmm6, 64(%r8)
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, 128(%r8)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm1, 192(%r9)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm1, (%r9)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm1, 64(%r9)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm1, 128(%r9)
 ; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, 192(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm11, (%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm5, 64(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm28, 128(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, 192(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, (%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm4, 64(%rax)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm0, 128(%rax)
 ; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-NEXT:    vmovdqa64 %zmm10, 128(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm8, 192(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm14, (%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm13, 64(%rax)
-; AVX512BW-NEXT:    addq $2152, %rsp # imm = 0x868
+; AVX512BW-NEXT:    vmovaps %zmm8, 128(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm7, 192(%rax)
+; AVX512BW-NEXT:    vmovaps %zmm9, (%rax)
+; AVX512BW-NEXT:    vmovaps %zmm11, 64(%rax)
+; AVX512BW-NEXT:    addq $2760, %rsp # imm = 0xAC8
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
   %wide.vec = load <224 x i64>, ptr %in.vec, align 64
@@ -8015,45 +8085,46 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ;
 ; AVX512F-LABEL: load_i64_stride7_vf64:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    subq $6728, %rsp # imm = 0x1A48
-; AVX512F-NEXT:    vmovdqa64 3328(%rdi), %zmm17
-; AVX512F-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 3264(%rdi), %zmm3
-; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    subq $7624, %rsp # imm = 0x1DC8
+; AVX512F-NEXT:    vmovdqa64 3328(%rdi), %zmm16
+; AVX512F-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 3264(%rdi), %zmm8
+; AVX512F-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-NEXT:    vmovdqa64 3008(%rdi), %zmm19
-; AVX512F-NEXT:    vmovdqa64 2944(%rdi), %zmm22
+; AVX512F-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 2944(%rdi), %zmm20
 ; AVX512F-NEXT:    vmovdqa64 2880(%rdi), %zmm2
 ; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-NEXT:    vmovdqa64 2816(%rdi), %zmm1
 ; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 2752(%rdi), %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm20
-; AVX512F-NEXT:    vmovdqa64 2688(%rdi), %zmm15
-; AVX512F-NEXT:    vmovdqa64 2432(%rdi), %zmm4
-; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 2368(%rdi), %zmm5
-; AVX512F-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 1984(%rdi), %zmm7
+; AVX512F-NEXT:    vmovdqa64 2752(%rdi), %zmm18
+; AVX512F-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 2688(%rdi), %zmm7
 ; AVX512F-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 1920(%rdi), %zmm8
-; AVX512F-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 1536(%rdi), %zmm9
+; AVX512F-NEXT:    vmovdqa64 2432(%rdi), %zmm17
+; AVX512F-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 2368(%rdi), %zmm9
 ; AVX512F-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 1472(%rdi), %zmm10
-; AVX512F-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 1088(%rdi), %zmm13
-; AVX512F-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 1024(%rdi), %zmm11
+; AVX512F-NEXT:    vmovdqa64 1984(%rdi), %zmm11
 ; AVX512F-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 640(%rdi), %zmm14
+; AVX512F-NEXT:    vmovdqa64 1920(%rdi), %zmm3
+; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 1536(%rdi), %zmm12
+; AVX512F-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 1472(%rdi), %zmm13
+; AVX512F-NEXT:    vmovdqu64 %zmm13, (%rsp) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 1088(%rdi), %zmm14
 ; AVX512F-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 1024(%rdi), %zmm4
+; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 640(%rdi), %zmm10
+; AVX512F-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-NEXT:    vmovdqa64 576(%rdi), %zmm6
 ; AVX512F-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 192(%rdi), %zmm18
-; AVX512F-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 128(%rdi), %zmm12
-; AVX512F-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 192(%rdi), %zmm15
+; AVX512F-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 128(%rdi), %zmm5
+; AVX512F-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7]
 ; AVX512F-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
 ; AVX512F-NEXT:    vpermt2q %zmm2, %zmm0, %zmm1
@@ -8061,972 +8132,997 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512F-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
 ; AVX512F-NEXT:    vinserti32x4 $0, %xmm2, %zmm1, %zmm1
 ; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm1
-; AVX512F-NEXT:    vpermt2q %zmm14, %zmm0, %zmm1
+; AVX512F-NEXT:    vpermt2q %zmm10, %zmm0, %zmm6
 ; AVX512F-NEXT:    vmovdqa 464(%rdi), %xmm2
 ; AVX512F-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
-; AVX512F-NEXT:    vinserti32x4 $0, %xmm2, %zmm1, %zmm1
+; AVX512F-NEXT:    vinserti32x4 $0, %xmm2, %zmm6, %zmm1
 ; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm12, %zmm1
-; AVX512F-NEXT:    vpermt2q %zmm18, %zmm0, %zmm1
+; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm1
+; AVX512F-NEXT:    vpermt2q %zmm15, %zmm0, %zmm1
 ; AVX512F-NEXT:    vmovdqa 16(%rdi), %xmm2
 ; AVX512F-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
 ; AVX512F-NEXT:    vinserti32x4 $0, %xmm2, %zmm1, %zmm1
 ; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm10, %zmm1
-; AVX512F-NEXT:    vpermt2q %zmm9, %zmm0, %zmm1
+; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm1
+; AVX512F-NEXT:    vpermt2q %zmm12, %zmm0, %zmm1
 ; AVX512F-NEXT:    vmovdqa 1360(%rdi), %xmm2
 ; AVX512F-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
 ; AVX512F-NEXT:    vinserti32x4 $0, %xmm2, %zmm1, %zmm1
 ; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm11, %zmm1
-; AVX512F-NEXT:    vpermt2q %zmm13, %zmm0, %zmm1
+; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm1
+; AVX512F-NEXT:    vpermt2q %zmm14, %zmm0, %zmm1
 ; AVX512F-NEXT:    vmovdqa 912(%rdi), %xmm2
 ; AVX512F-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
 ; AVX512F-NEXT:    vinserti32x4 $0, %xmm2, %zmm1, %zmm1
 ; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm1
-; AVX512F-NEXT:    vpermt2q %zmm4, %zmm0, %zmm1
+; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm1
+; AVX512F-NEXT:    vpermt2q %zmm17, %zmm0, %zmm1
 ; AVX512F-NEXT:    vmovdqa 2256(%rdi), %xmm2
 ; AVX512F-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
 ; AVX512F-NEXT:    vinserti32x4 $0, %xmm2, %zmm1, %zmm1
 ; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm1
-; AVX512F-NEXT:    vpermt2q %zmm7, %zmm0, %zmm1
+; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm1
+; AVX512F-NEXT:    vpermt2q %zmm11, %zmm0, %zmm1
 ; AVX512F-NEXT:    vmovdqa 1808(%rdi), %xmm2
 ; AVX512F-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
 ; AVX512F-NEXT:    vinserti32x4 $0, %xmm2, %zmm1, %zmm1
 ; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermi2q %zmm17, %zmm3, %zmm0
+; AVX512F-NEXT:    vpermi2q %zmm16, %zmm8, %zmm0
 ; AVX512F-NEXT:    vmovdqa 3152(%rdi), %xmm1
 ; AVX512F-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3]
 ; AVX512F-NEXT:    vinserti32x4 $0, %xmm1, %zmm0, %zmm0
 ; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vinserti128 $1, 2880(%rdi), %ymm0, %ymm1
-; AVX512F-NEXT:    vmovdqa 2816(%rdi), %ymm0
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7]
-; AVX512F-NEXT:    vmovdqa64 %ymm0, %ymm16
-; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm5 = [4,11]
-; AVX512F-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm15, %zmm2
-; AVX512F-NEXT:    vpermt2q %zmm20, %zmm5, %zmm2
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm4 = [0,7,14,0,0,7,14,0]
-; AVX512F-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vinserti128 $1, 2880(%rdi), %ymm0, %ymm1
+; AVX512F-NEXT:    vmovdqa 2816(%rdi), %ymm0
+; AVX512F-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7]
+; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm9 = [4,11]
+; AVX512F-NEXT:    vpermt2q %zmm18, %zmm9, %zmm7
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm11 = [0,7,14,0,0,7,14,0]
+; AVX512F-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vmovdqa64 %zmm20, %zmm12
+; AVX512F-NEXT:    vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm20, %zmm2
+; AVX512F-NEXT:    vpermt2q %zmm19, %zmm11, %zmm2
+; AVX512F-NEXT:    vmovdqa64 3072(%rdi), %zmm3
+; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,13,4,5,6,13]
+; AVX512F-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vpermt2q %zmm3, %zmm4, %zmm2
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm2, %zmm1
+; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vinserti128 $1, 640(%rdi), %ymm0, %ymm2
+; AVX512F-NEXT:    vmovdqa 576(%rdi), %ymm0
+; AVX512F-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm2[6,7]
+; AVX512F-NEXT:    vmovdqa64 512(%rdi), %zmm6
+; AVX512F-NEXT:    vmovdqa64 448(%rdi), %zmm3
+; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm6, %zmm9, %zmm3
+; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm16
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX512F-NEXT:    vmovdqa64 768(%rdi), %zmm31
+; AVX512F-NEXT:    vmovdqa64 704(%rdi), %zmm3
+; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm31, %zmm11, %zmm3
+; AVX512F-NEXT:    vmovdqa64 832(%rdi), %zmm5
+; AVX512F-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm5, %zmm4, %zmm3
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vinserti128 $1, 192(%rdi), %ymm0, %ymm3
+; AVX512F-NEXT:    vmovdqa 128(%rdi), %ymm2
+; AVX512F-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm3 = ymm2[0,1,2,3,4,5],ymm3[6,7]
+; AVX512F-NEXT:    vmovdqa64 64(%rdi), %zmm8
+; AVX512F-NEXT:    vmovdqa64 (%rdi), %zmm2
+; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm5
+; AVX512F-NEXT:    vpermt2q %zmm8, %zmm9, %zmm5
+; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm23
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7]
+; AVX512F-NEXT:    vmovdqa64 320(%rdi), %zmm13
+; AVX512F-NEXT:    vmovdqa64 256(%rdi), %zmm6
+; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm5
+; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm24
+; AVX512F-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm13, %zmm11, %zmm5
+; AVX512F-NEXT:    vmovdqa64 384(%rdi), %zmm6
+; AVX512F-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm6, %zmm4, %zmm5
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm3, %zmm5, %zmm3
+; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vinserti128 $1, 1536(%rdi), %ymm0, %ymm5
+; AVX512F-NEXT:    vmovdqa 1472(%rdi), %ymm3
+; AVX512F-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm5 = ymm3[0,1,2,3,4,5],ymm5[6,7]
+; AVX512F-NEXT:    vmovdqa64 1408(%rdi), %zmm3
+; AVX512F-NEXT:    vmovdqa64 1344(%rdi), %zmm7
+; AVX512F-NEXT:    vmovdqa64 %zmm7, %zmm6
+; AVX512F-NEXT:    vpermt2q %zmm3, %zmm9, %zmm6
+; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm19
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm5[4,5,6,7]
+; AVX512F-NEXT:    vmovdqa64 1664(%rdi), %zmm17
+; AVX512F-NEXT:    vmovdqa64 1600(%rdi), %zmm8
+; AVX512F-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm17, %zmm11, %zmm8
+; AVX512F-NEXT:    vmovdqa64 1728(%rdi), %zmm3
+; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm3, %zmm4, %zmm8
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm6, %zmm8, %zmm5
+; AVX512F-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vinserti128 $1, 1088(%rdi), %ymm0, %ymm6
+; AVX512F-NEXT:    vmovdqa 1024(%rdi), %ymm3
+; AVX512F-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm6 = ymm3[0,1,2,3,4,5],ymm6[6,7]
+; AVX512F-NEXT:    vmovdqa64 960(%rdi), %zmm5
+; AVX512F-NEXT:    vmovdqa64 896(%rdi), %zmm1
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm8
+; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm5, %zmm9, %zmm8
+; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm30
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm6[4,5,6,7]
+; AVX512F-NEXT:    vmovdqa64 1216(%rdi), %zmm29
+; AVX512F-NEXT:    vmovdqa64 1152(%rdi), %zmm10
+; AVX512F-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm29, %zmm11, %zmm10
+; AVX512F-NEXT:    vmovdqa64 1280(%rdi), %zmm3
+; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm3, %zmm4, %zmm10
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm8, %zmm10, %zmm5
+; AVX512F-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vinserti128 $1, 2432(%rdi), %ymm0, %ymm8
+; AVX512F-NEXT:    vmovdqa 2368(%rdi), %ymm3
+; AVX512F-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm8 = ymm3[0,1,2,3,4,5],ymm8[6,7]
+; AVX512F-NEXT:    vmovdqa64 2304(%rdi), %zmm18
+; AVX512F-NEXT:    vmovdqa64 2240(%rdi), %zmm0
+; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm10
+; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm18, %zmm9, %zmm10
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm8[4,5,6,7]
+; AVX512F-NEXT:    vmovdqa64 2560(%rdi), %zmm20
+; AVX512F-NEXT:    vmovdqa64 2496(%rdi), %zmm14
+; AVX512F-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm20, %zmm11, %zmm14
+; AVX512F-NEXT:    vmovdqa64 2624(%rdi), %zmm21
+; AVX512F-NEXT:    vpermt2q %zmm21, %zmm4, %zmm14
+; AVX512F-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm10, %zmm14, %zmm5
+; AVX512F-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vinserti128 $1, 1984(%rdi), %ymm0, %ymm10
+; AVX512F-NEXT:    vmovdqa 1920(%rdi), %ymm3
+; AVX512F-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm10 = ymm3[0,1,2,3,4,5],ymm10[6,7]
+; AVX512F-NEXT:    vmovdqa64 1856(%rdi), %zmm22
+; AVX512F-NEXT:    vmovdqa64 1792(%rdi), %zmm5
+; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm15
+; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm6
+; AVX512F-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm22, %zmm9, %zmm15
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm10 = ymm15[0,1,2,3],ymm10[4,5,6,7]
+; AVX512F-NEXT:    vmovdqa64 2112(%rdi), %zmm26
+; AVX512F-NEXT:    vmovdqa64 2048(%rdi), %zmm15
+; AVX512F-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm26, %zmm11, %zmm15
+; AVX512F-NEXT:    vmovdqa64 2176(%rdi), %zmm28
+; AVX512F-NEXT:    vpermt2q %zmm28, %zmm4, %zmm15
+; AVX512F-NEXT:    vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm10, %zmm15, %zmm5
+; AVX512F-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vinserti128 $1, 3328(%rdi), %ymm0, %ymm10
+; AVX512F-NEXT:    vmovdqa 3264(%rdi), %ymm3
+; AVX512F-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm10 = ymm3[0,1,2,3,4,5],ymm10[6,7]
+; AVX512F-NEXT:    vmovdqa64 3200(%rdi), %zmm25
+; AVX512F-NEXT:    vmovdqa64 3136(%rdi), %zmm14
+; AVX512F-NEXT:    vpermi2q %zmm25, %zmm14, %zmm9
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm10 = ymm9[0,1,2,3],ymm10[4,5,6,7]
+; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm5
+; AVX512F-NEXT:    vpermt2q %zmm23, %zmm11, %zmm5
+; AVX512F-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm5
+; AVX512F-NEXT:    vmovdqa64 %zmm16, %zmm27
+; AVX512F-NEXT:    vpermt2q %zmm16, %zmm11, %zmm5
+; AVX512F-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm30, %zmm11, %zmm1
+; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm7, %zmm3
+; AVX512F-NEXT:    vmovdqa64 %zmm7, %zmm5
+; AVX512F-NEXT:    vpermt2q %zmm19, %zmm11, %zmm5
+; AVX512F-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm5
+; AVX512F-NEXT:    vpermt2q %zmm22, %zmm11, %zmm5
+; AVX512F-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm18, %zmm11, %zmm0
+; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm15, %zmm9
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm7, %zmm11, %zmm9
+; AVX512F-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm14, %zmm9
+; AVX512F-NEXT:    vpermt2q %zmm25, %zmm11, %zmm9
+; AVX512F-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 3456(%rdi), %zmm9
+; AVX512F-NEXT:    vmovdqa64 3392(%rdi), %zmm0
+; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermi2q %zmm9, %zmm0, %zmm11
+; AVX512F-NEXT:    vmovdqa64 3520(%rdi), %zmm6
+; AVX512F-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm6, %zmm4, %zmm11
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm10, %zmm11, %zmm4
+; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa 2880(%rdi), %ymm4
+; AVX512F-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload
+; AVX512F-NEXT:    # ymm0 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23]
+; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm11 = [5,12]
+; AVX512F-NEXT:    vmovdqa64 %zmm15, %zmm4
+; AVX512F-NEXT:    vpermt2q %zmm7, %zmm11, %zmm4
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm10 = ymm4[0,1,2,3],ymm0[4,5,6,7]
+; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [9,0,7,0,9,0,7,0]
+; AVX512F-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm12, %zmm0, %zmm16
+; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,14,4,5,6,14]
+; AVX512F-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm16 # 64-byte Folded Reload
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm10, %zmm16, %zmm5
+; AVX512F-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa 640(%rdi), %ymm10
+; AVX512F-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm1 # 32-byte Folded Reload
+; AVX512F-NEXT:    # ymm1 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23]
+; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm10
+; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm6
+; AVX512F-NEXT:    vpermt2q %zmm27, %zmm11, %zmm10
+; AVX512F-NEXT:    vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT:    vmovdqa64 %zmm31, %zmm10
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm16, %zmm0, %zmm10
+; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm10 # 64-byte Folded Reload
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm10, %zmm1
+; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa 192(%rdi), %ymm1
+; AVX512F-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512F-NEXT:    # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
+; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm5
+; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm23, %zmm11, %zmm2
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm2
+; AVX512F-NEXT:    vpermt2q %zmm24, %zmm0, %zmm2
+; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm2 # 64-byte Folded Reload
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm2, %zmm1
+; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa 1536(%rdi), %ymm1
+; AVX512F-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512F-NEXT:    # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
+; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm8
+; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm2
+; AVX512F-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm19, %zmm11, %zmm2
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT:    vmovdqa64 %zmm17, %zmm2
+; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
+; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm2 # 64-byte Folded Reload
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm2, %zmm1
+; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa 1088(%rdi), %ymm1
+; AVX512F-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512F-NEXT:    # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm10, %zmm2
+; AVX512F-NEXT:    vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm30, %zmm11, %zmm2
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT:    vmovdqa64 %zmm29, %zmm2
+; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
+; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm2 # 64-byte Folded Reload
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm2, %zmm1
+; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa 2432(%rdi), %ymm1
+; AVX512F-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512F-NEXT:    # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm2
+; AVX512F-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm18, %zmm11, %zmm2
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT:    vmovdqa64 %zmm20, %zmm2
+; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
+; AVX512F-NEXT:    vpermt2q %zmm21, %zmm4, %zmm2
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm2, %zmm1
+; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa 1984(%rdi), %ymm1
+; AVX512F-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512F-NEXT:    # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm12, %zmm2
+; AVX512F-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm22, %zmm11, %zmm2
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT:    vmovdqa64 %zmm26, %zmm2
+; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
+; AVX512F-NEXT:    vpermt2q %zmm28, %zmm4, %zmm2
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm2, %zmm1
+; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa 3328(%rdi), %ymm1
+; AVX512F-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512F-NEXT:    # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
+; AVX512F-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermi2q %zmm25, %zmm14, %zmm11
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT:    vpermt2q %zmm15, %zmm0, %zmm7
+; AVX512F-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm27, %zmm2
+; AVX512F-NEXT:    vpermt2q %zmm6, %zmm0, %zmm2
+; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm23, %zmm2
+; AVX512F-NEXT:    vpermt2q %zmm5, %zmm0, %zmm2
+; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm19, %zmm2
+; AVX512F-NEXT:    vpermt2q %zmm8, %zmm0, %zmm2
+; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm30, %zmm2
+; AVX512F-NEXT:    vpermt2q %zmm10, %zmm0, %zmm2
+; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm3, %zmm0, %zmm18
+; AVX512F-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm12, %zmm0, %zmm22
+; AVX512F-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm14, %zmm0, %zmm25
+; AVX512F-NEXT:    vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm25
+; AVX512F-NEXT:    vpermi2q %zmm6, %zmm9, %zmm0
+; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm0 # 64-byte Folded Reload
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm0
+; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm23 = [10,3,10,3,10,3,10,3]
+; AVX512F-NEXT:    # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vmovdqa64 %zmm16, %zmm2
+; AVX512F-NEXT:    vmovdqa64 %zmm16, %zmm12
+; AVX512F-NEXT:    vpermt2q %zmm31, %zmm23, %zmm12
+; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm24 = [11,4,11,4,11,4,11,4]
+; AVX512F-NEXT:    # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm0
+; AVX512F-NEXT:    vpermt2q %zmm31, %zmm24, %zmm0
+; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm8 = [12,5,12,5,12,5,12,5]
+; AVX512F-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm0
+; AVX512F-NEXT:    vpermt2q %zmm31, %zmm8, %zmm0
+; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm16 = [13,6,13,6,13,6,13,6]
+; AVX512F-NEXT:    # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm0
+; AVX512F-NEXT:    vpermt2q %zmm31, %zmm16, %zmm0
+; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [2,9,2,9,2,9,2,9]
+; AVX512F-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vpermt2q %zmm31, %zmm1, %zmm2
+; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm28, %zmm0
+; AVX512F-NEXT:    vpermt2q %zmm13, %zmm23, %zmm0
+; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm10
+; AVX512F-NEXT:    vmovdqa64 %zmm28, %zmm0
+; AVX512F-NEXT:    vpermt2q %zmm13, %zmm24, %zmm0
+; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm28, %zmm0
+; AVX512F-NEXT:    vpermt2q %zmm13, %zmm8, %zmm0
+; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm28, %zmm0
+; AVX512F-NEXT:    vpermt2q %zmm13, %zmm16, %zmm0
+; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm13, %zmm1, %zmm28
+; AVX512F-NEXT:    vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm22, %zmm0
+; AVX512F-NEXT:    vpermt2q %zmm17, %zmm23, %zmm0
+; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm14
+; AVX512F-NEXT:    vmovdqa64 %zmm22, %zmm0
+; AVX512F-NEXT:    vpermt2q %zmm17, %zmm24, %zmm0
+; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm18
+; AVX512F-NEXT:    vmovdqa64 %zmm22, %zmm0
+; AVX512F-NEXT:    vpermt2q %zmm17, %zmm8, %zmm0
+; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm22, %zmm0
+; AVX512F-NEXT:    vpermt2q %zmm17, %zmm16, %zmm0
+; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm17, %zmm1, %zmm22
 ; AVX512F-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm22, %zmm2
-; AVX512F-NEXT:    vmovdqa64 %zmm19, %zmm18
-; AVX512F-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm19, %zmm4, %zmm2
-; AVX512F-NEXT:    vmovdqa64 3072(%rdi), %zmm0
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm30, %zmm9
+; AVX512F-NEXT:    vpermt2q %zmm29, %zmm23, %zmm9
+; AVX512F-NEXT:    vmovdqa64 %zmm30, %zmm0
+; AVX512F-NEXT:    vpermt2q %zmm29, %zmm24, %zmm0
+; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm19
+; AVX512F-NEXT:    vmovdqa64 %zmm30, %zmm0
+; AVX512F-NEXT:    vpermt2q %zmm29, %zmm8, %zmm0
 ; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm9 = [4,5,6,13,4,5,6,13]
-; AVX512F-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vpermt2q %zmm0, %zmm9, %zmm2
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm2, %zmm1
-; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vinserti128 $1, 640(%rdi), %ymm0, %ymm1
-; AVX512F-NEXT:    vmovdqa 576(%rdi), %ymm2
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
-; AVX512F-NEXT:    vmovdqa64 512(%rdi), %zmm28
-; AVX512F-NEXT:    vmovdqa64 448(%rdi), %zmm3
-; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm28, %zmm5, %zmm3
-; AVX512F-NEXT:    vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
-; AVX512F-NEXT:    vmovdqa64 768(%rdi), %zmm13
-; AVX512F-NEXT:    vmovdqa64 704(%rdi), %zmm3
-; AVX512F-NEXT:    vmovdqu64 %zmm3, (%rsp) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm13, %zmm4, %zmm3
-; AVX512F-NEXT:    vmovdqa64 832(%rdi), %zmm7
-; AVX512F-NEXT:    vpermt2q %zmm7, %zmm9, %zmm3
-; AVX512F-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm3, %zmm1
-; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vinserti128 $1, 192(%rdi), %ymm0, %ymm1
-; AVX512F-NEXT:    vmovdqa 128(%rdi), %ymm0
-; AVX512F-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7]
-; AVX512F-NEXT:    vmovdqa64 64(%rdi), %zmm6
-; AVX512F-NEXT:    vmovdqa64 (%rdi), %zmm3
+; AVX512F-NEXT:    vmovdqa64 %zmm30, %zmm0
+; AVX512F-NEXT:    vpermt2q %zmm29, %zmm16, %zmm0
+; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm29, %zmm1, %zmm30
+; AVX512F-NEXT:    vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm7
+; AVX512F-NEXT:    vpermt2q %zmm20, %zmm23, %zmm7
+; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm3
+; AVX512F-NEXT:    vpermt2q %zmm20, %zmm24, %zmm3
+; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm21
+; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm3
+; AVX512F-NEXT:    vpermt2q %zmm20, %zmm8, %zmm3
 ; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm6, %zmm5, %zmm3
-; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm30
-; AVX512F-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
-; AVX512F-NEXT:    vmovdqa64 320(%rdi), %zmm10
-; AVX512F-NEXT:    vmovdqa64 256(%rdi), %zmm3
+; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm3
+; AVX512F-NEXT:    vpermt2q %zmm20, %zmm16, %zmm3
 ; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm10, %zmm4, %zmm3
-; AVX512F-NEXT:    vmovdqa64 384(%rdi), %zmm27
-; AVX512F-NEXT:    vpermt2q %zmm27, %zmm9, %zmm3
-; AVX512F-NEXT:    vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm3, %zmm1
-; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vinserti128 $1, 1536(%rdi), %ymm0, %ymm1
-; AVX512F-NEXT:    vmovdqa 1472(%rdi), %ymm3
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
-; AVX512F-NEXT:    vmovdqa64 1408(%rdi), %zmm25
-; AVX512F-NEXT:    vmovdqa64 1344(%rdi), %zmm6
-; AVX512F-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm25, %zmm5, %zmm6
-; AVX512F-NEXT:    vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7]
-; AVX512F-NEXT:    vmovdqa64 1664(%rdi), %zmm14
-; AVX512F-NEXT:    vmovdqa64 1600(%rdi), %zmm6
-; AVX512F-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm14, %zmm4, %zmm6
-; AVX512F-NEXT:    vmovdqa64 1728(%rdi), %zmm0
+; AVX512F-NEXT:    vpermt2q %zmm20, %zmm1, %zmm0
 ; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm0, %zmm9, %zmm6
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm6, %zmm1
-; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vinserti128 $1, 1088(%rdi), %ymm0, %ymm1
-; AVX512F-NEXT:    vmovdqa 1024(%rdi), %ymm0
-; AVX512F-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7]
-; AVX512F-NEXT:    vmovdqa64 960(%rdi), %zmm19
-; AVX512F-NEXT:    vmovdqa64 896(%rdi), %zmm8
-; AVX512F-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm19, %zmm5, %zmm8
-; AVX512F-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7]
-; AVX512F-NEXT:    vmovdqa64 1216(%rdi), %zmm20
-; AVX512F-NEXT:    vmovdqa64 1152(%rdi), %zmm8
-; AVX512F-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm20, %zmm4, %zmm8
-; AVX512F-NEXT:    vmovdqa64 1280(%rdi), %zmm0
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm11, %zmm30
+; AVX512F-NEXT:    vpermt2q %zmm26, %zmm23, %zmm30
+; AVX512F-NEXT:    vmovdqa64 %zmm11, %zmm0
+; AVX512F-NEXT:    vpermt2q %zmm26, %zmm24, %zmm0
+; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm22
+; AVX512F-NEXT:    vmovdqa64 %zmm11, %zmm5
+; AVX512F-NEXT:    vpermt2q %zmm26, %zmm8, %zmm5
+; AVX512F-NEXT:    vmovdqa64 %zmm11, %zmm0
+; AVX512F-NEXT:    vpermt2q %zmm26, %zmm16, %zmm0
+; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm29
+; AVX512F-NEXT:    vpermt2q %zmm26, %zmm1, %zmm11
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm15, %zmm28
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm20, %zmm23, %zmm28
+; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm31
+; AVX512F-NEXT:    vmovdqa64 %zmm25, %zmm3
+; AVX512F-NEXT:    vpermi2q %zmm25, %zmm6, %zmm23
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload
 ; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm0, %zmm9, %zmm8
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm8, %zmm1
-; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vinserti128 $1, 2432(%rdi), %ymm0, %ymm1
-; AVX512F-NEXT:    vmovdqa 2368(%rdi), %ymm0
-; AVX512F-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7]
-; AVX512F-NEXT:    vmovdqa64 2304(%rdi), %zmm0
+; AVX512F-NEXT:    vmovdqa64 %zmm15, %zmm27
+; AVX512F-NEXT:    vpermt2q %zmm20, %zmm24, %zmm27
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm13, %zmm16, %zmm0
 ; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 2240(%rdi), %zmm8
-; AVX512F-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm0, %zmm5, %zmm8
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7]
-; AVX512F-NEXT:    vmovdqa64 2560(%rdi), %zmm21
-; AVX512F-NEXT:    vmovdqa64 2496(%rdi), %zmm8
-; AVX512F-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm21, %zmm4, %zmm8
-; AVX512F-NEXT:    vmovdqa64 2624(%rdi), %zmm26
-; AVX512F-NEXT:    vpermt2q %zmm26, %zmm9, %zmm8
-; AVX512F-NEXT:    vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm8, %zmm1
-; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vinserti128 $1, 1984(%rdi), %ymm0, %ymm1
-; AVX512F-NEXT:    vmovdqa 1920(%rdi), %ymm0
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7]
-; AVX512F-NEXT:    vmovdqa64 %ymm0, %ymm29
-; AVX512F-NEXT:    vmovdqa64 1856(%rdi), %zmm17
-; AVX512F-NEXT:    vmovdqa64 1792(%rdi), %zmm8
-; AVX512F-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm17, %zmm5, %zmm8
-; AVX512F-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm1[4,5,6,7]
-; AVX512F-NEXT:    vmovdqa64 2112(%rdi), %zmm1
-; AVX512F-NEXT:    vmovdqa64 2048(%rdi), %zmm11
-; AVX512F-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm4, %zmm11
-; AVX512F-NEXT:    vmovdqa64 2176(%rdi), %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm0, %zmm9, %zmm11
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm24
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm17, %zmm16, %zmm0
 ; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm8, %zmm11, %zmm8
-; AVX512F-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vinserti128 $1, 3328(%rdi), %ymm0, %ymm8
-; AVX512F-NEXT:    vmovdqa 3264(%rdi), %ymm12
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3,4,5],ymm8[6,7]
-; AVX512F-NEXT:    vmovdqa64 3200(%rdi), %zmm11
-; AVX512F-NEXT:    vmovdqa64 3136(%rdi), %zmm6
-; AVX512F-NEXT:    vpermi2q %zmm11, %zmm6, %zmm5
-; AVX512F-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm8 = ymm5[0,1,2,3],ymm8[4,5,6,7]
-; AVX512F-NEXT:    vmovdqa64 3456(%rdi), %zmm5
-; AVX512F-NEXT:    vmovdqa64 3392(%rdi), %zmm31
-; AVX512F-NEXT:    vpermi2q %zmm5, %zmm31, %zmm4
-; AVX512F-NEXT:    vmovdqa64 3520(%rdi), %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm0, %zmm9, %zmm4
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm23
+; AVX512F-NEXT:    vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm26, %zmm16, %zmm0
 ; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm8, %zmm4, %zmm4
-; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa 2880(%rdi), %ymm4
-; AVX512F-NEXT:    vmovdqa64 %ymm16, %ymm0
-; AVX512F-NEXT:    vpalignr {{.*#+}} ymm4 = ymm0[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm0[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23]
-; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm0 = [5,12]
-; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm15 # 64-byte Folded Reload
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm9 = ymm15[0,1,2,3],ymm4[4,5,6,7]
-; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm4 = [9,0,7,0,9,0,7,0]
-; AVX512F-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm18, %zmm16
-; AVX512F-NEXT:    vpermt2q %zmm22, %zmm4, %zmm16
-; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [4,5,6,14,4,5,6,14]
-; AVX512F-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm16 # 64-byte Folded Reload
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm9, %zmm16, %zmm9
-; AVX512F-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa 640(%rdi), %ymm9
-; AVX512F-NEXT:    vpalignr {{.*#+}} ymm2 = ymm2[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23]
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm28, %zmm0, %zmm9
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3],ymm2[4,5,6,7]
-; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm9
-; AVX512F-NEXT:    vmovdqu64 (%rsp), %zmm15 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm15, %zmm4, %zmm9
-; AVX512F-NEXT:    vpermt2q %zmm7, %zmm8, %zmm9
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm9, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa 192(%rdi), %ymm2
-; AVX512F-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
-; AVX512F-NEXT:    # ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23]
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm30, %zmm0, %zmm7
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7]
-; AVX512F-NEXT:    vmovdqa64 %zmm10, %zmm7
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm9, %zmm4, %zmm7
-; AVX512F-NEXT:    vpermt2q %zmm27, %zmm8, %zmm7
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm7, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa 1536(%rdi), %ymm2
-; AVX512F-NEXT:    vpalignr {{.*#+}} ymm2 = ymm3[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23]
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm25, %zmm0, %zmm3
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
-; AVX512F-NEXT:    vmovdqa64 %zmm14, %zmm3
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm25, %zmm4, %zmm3
-; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm3 # 64-byte Folded Reload
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa 1088(%rdi), %ymm2
-; AVX512F-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
-; AVX512F-NEXT:    # ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23]
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm19, %zmm0, %zmm3
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
-; AVX512F-NEXT:    vmovdqa64 %zmm20, %zmm3
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm30, %zmm4, %zmm3
-; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm3 # 64-byte Folded Reload
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa 2432(%rdi), %ymm2
-; AVX512F-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
-; AVX512F-NEXT:    # ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23]
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
-; AVX512F-NEXT:    vmovdqa64 %zmm21, %zmm3
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm19, %zmm4, %zmm3
-; AVX512F-NEXT:    vpermt2q %zmm26, %zmm8, %zmm3
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa 1984(%rdi), %ymm2
-; AVX512F-NEXT:    vmovdqa64 %ymm29, %ymm3
-; AVX512F-NEXT:    vpalignr {{.*#+}} ymm2 = ymm3[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23]
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm17, %zmm0, %zmm3
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
-; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm3
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm27, %zmm4, %zmm3
-; AVX512F-NEXT:    vpermt2q %zmm24, %zmm8, %zmm3
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa 3328(%rdi), %ymm2
-; AVX512F-NEXT:    vpalignr {{.*#+}} ymm2 = ymm12[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23]
-; AVX512F-NEXT:    vpermi2q %zmm11, %zmm6, %zmm0
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
-; AVX512F-NEXT:    vpermi2q %zmm31, %zmm5, %zmm4
-; AVX512F-NEXT:    vpermt2q %zmm23, %zmm8, %zmm4
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm0
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm4, %zmm16, %zmm0
 ; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm26 = [10,3,10,3,10,3,10,3]
-; AVX512F-NEXT:    # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm15, %zmm4
-; AVX512F-NEXT:    vmovdqa64 %zmm15, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm13, %zmm26, %zmm0
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload
 ; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [11,4,11,4,11,4,11,4]
-; AVX512F-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm15, %zmm2
-; AVX512F-NEXT:    vpermt2q %zmm13, %zmm0, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm3
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm7 = [12,5,12,5,12,5,12,5]
-; AVX512F-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm15, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm13, %zmm7, %zmm0
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm6 # 64-byte Folded Reload
+; AVX512F-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload
 ; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm12 = [13,6,13,6,13,6,13,6]
-; AVX512F-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vpermi2q %zmm25, %zmm31, %zmm24
+; AVX512F-NEXT:    vmovdqa64 %zmm15, %zmm25
+; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm2
+; AVX512F-NEXT:    vpermt2q %zmm20, %zmm8, %zmm25
+; AVX512F-NEXT:    vpermi2q %zmm3, %zmm31, %zmm2
 ; AVX512F-NEXT:    vmovdqa64 %zmm15, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm13, %zmm12, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [2,9,2,9,2,9,2,9]
-; AVX512F-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vpermt2q %zmm13, %zmm0, %zmm4
-; AVX512F-NEXT:    vmovdqu64 %zmm4, (%rsp) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm17
-; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm2
-; AVX512F-NEXT:    vpermt2q %zmm10, %zmm26, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm2
-; AVX512F-NEXT:    vpermt2q %zmm10, %zmm3, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm2
-; AVX512F-NEXT:    vpermt2q %zmm10, %zmm7, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm2
-; AVX512F-NEXT:    vpermt2q %zmm10, %zmm12, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm10, %zmm0, %zmm17
-; AVX512F-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm25, %zmm2
-; AVX512F-NEXT:    vpermt2q %zmm14, %zmm26, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm25, %zmm2
-; AVX512F-NEXT:    vpermt2q %zmm14, %zmm3, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm25, %zmm2
-; AVX512F-NEXT:    vpermt2q %zmm14, %zmm7, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm25, %zmm2
-; AVX512F-NEXT:    vpermt2q %zmm14, %zmm12, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm14, %zmm0, %zmm25
-; AVX512F-NEXT:    vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm30, %zmm2
-; AVX512F-NEXT:    vpermt2q %zmm20, %zmm26, %zmm2
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm13
-; AVX512F-NEXT:    vmovdqa64 %zmm30, %zmm24
-; AVX512F-NEXT:    vpermt2q %zmm20, %zmm3, %zmm24
-; AVX512F-NEXT:    vmovdqa64 %zmm30, %zmm25
-; AVX512F-NEXT:    vpermt2q %zmm20, %zmm7, %zmm25
-; AVX512F-NEXT:    vmovdqa64 %zmm30, %zmm2
-; AVX512F-NEXT:    vpermt2q %zmm20, %zmm12, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm20, %zmm0, %zmm30
-; AVX512F-NEXT:    vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm19, %zmm15
-; AVX512F-NEXT:    vpermt2q %zmm21, %zmm26, %zmm15
-; AVX512F-NEXT:    vmovdqa64 %zmm19, %zmm18
-; AVX512F-NEXT:    vpermt2q %zmm21, %zmm3, %zmm18
-; AVX512F-NEXT:    vmovdqa64 %zmm19, %zmm22
-; AVX512F-NEXT:    vpermt2q %zmm21, %zmm7, %zmm22
-; AVX512F-NEXT:    vmovdqa64 %zmm19, %zmm23
-; AVX512F-NEXT:    vpermt2q %zmm21, %zmm12, %zmm23
-; AVX512F-NEXT:    vpermt2q %zmm21, %zmm0, %zmm19
-; AVX512F-NEXT:    vmovdqa64 %zmm27, %zmm9
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm26, %zmm9
-; AVX512F-NEXT:    vmovdqa64 %zmm27, %zmm11
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm3, %zmm11
-; AVX512F-NEXT:    vmovdqa64 %zmm27, %zmm17
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm7, %zmm17
-; AVX512F-NEXT:    vmovdqa64 %zmm27, %zmm8
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm12, %zmm8
-; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm21
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm0, %zmm27
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm20, %zmm4
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm26, %zmm4
-; AVX512F-NEXT:    vpermi2q %zmm5, %zmm31, %zmm26
-; AVX512F-NEXT:    vmovdqa64 %zmm20, %zmm30
-; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm2
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm3, %zmm30
-; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm16
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm1 # 64-byte Folded Reload
-; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm29, %zmm12, %zmm1
-; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm1 # 64-byte Folded Reload
-; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm1 # 64-byte Folded Reload
-; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm1 # 64-byte Folded Reload
-; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm1 # 64-byte Folded Reload
-; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm1 # 64-byte Folded Reload
-; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm31, %zmm8
-; AVX512F-NEXT:    vpermi2q %zmm5, %zmm31, %zmm2
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm1 # 64-byte Folded Reload
-; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm20, %zmm28
-; AVX512F-NEXT:    vpermt2q %zmm16, %zmm7, %zmm28
-; AVX512F-NEXT:    vpermi2q %zmm5, %zmm31, %zmm7
-; AVX512F-NEXT:    vmovdqa64 %zmm20, %zmm3
-; AVX512F-NEXT:    vpermt2q %zmm16, %zmm12, %zmm3
-; AVX512F-NEXT:    vpermi2q %zmm5, %zmm31, %zmm12
-; AVX512F-NEXT:    vpermt2q %zmm5, %zmm0, %zmm8
-; AVX512F-NEXT:    vpermt2q %zmm16, %zmm0, %zmm20
-; AVX512F-NEXT:    vmovdqa64 %zmm20, %zmm1
-; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [0,5,6,9,0,5,6,9]
-; AVX512F-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm10, %zmm0, %zmm5
-; AVX512F-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm20, %zmm16, %zmm0
+; AVX512F-NEXT:    vpermi2q %zmm3, %zmm31, %zmm16
+; AVX512F-NEXT:    vpermt2q %zmm3, %zmm1, %zmm31
+; AVX512F-NEXT:    vmovdqa64 %zmm31, %zmm3
+; AVX512F-NEXT:    vpermt2q %zmm20, %zmm1, %zmm15
+; AVX512F-NEXT:    vmovdqa64 %zmm15, %zmm31
+; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,9,0,5,6,9]
+; AVX512F-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm8, %zmm1, %zmm10
+; AVX512F-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm6, %zmm0, %zmm5
-; AVX512F-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm16, %zmm0, %zmm13
-; AVX512F-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm14, %zmm0, %zmm5
-; AVX512F-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm13, %zmm0, %zmm9
+; AVX512F-NEXT:    vpermt2q %zmm6, %zmm1, %zmm12
+; AVX512F-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm10, %zmm1, %zmm9
 ; AVX512F-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm31, %zmm0, %zmm15
-; AVX512F-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm5, %zmm0, %zmm4
-; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm9, %zmm1, %zmm14
+; AVX512F-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm15, %zmm0, %zmm26
-; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [0,5,6,10,0,5,6,10]
-; AVX512F-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vpermt2q %zmm5, %zmm0, %zmm30
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm6, %zmm0, %zmm20
-; AVX512F-NEXT:    vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm10, %zmm0, %zmm20
-; AVX512F-NEXT:    vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm15, %zmm1, %zmm30
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm14, %zmm1, %zmm7
+; AVX512F-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm7, %zmm1, %zmm28
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm14, %zmm0, %zmm20
-; AVX512F-NEXT:    vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm16, %zmm0, %zmm24
-; AVX512F-NEXT:    vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm31, %zmm0, %zmm18
+; AVX512F-NEXT:    vpermt2q %zmm20, %zmm1, %zmm23
+; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,10,0,5,6,10]
+; AVX512F-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vpermt2q %zmm7, %zmm1, %zmm27
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm6, %zmm1, %zmm12
+; AVX512F-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm8, %zmm1, %zmm12
+; AVX512F-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm9, %zmm1, %zmm18
 ; AVX512F-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm13, %zmm0, %zmm11
-; AVX512F-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm15, %zmm0, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [0,5,6,11,0,5,6,11]
-; AVX512F-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vpermt2q %zmm5, %zmm0, %zmm28
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm6, %zmm0, %zmm4
-; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm10, %zmm0, %zmm9
-; AVX512F-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm14, %zmm0, %zmm9
-; AVX512F-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm16, %zmm0, %zmm25
-; AVX512F-NEXT:    vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm31, %zmm0, %zmm22
-; AVX512F-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm13, %zmm0, %zmm17
-; AVX512F-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm15, %zmm0, %zmm7
-; AVX512F-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [0,5,6,12,0,5,6,12]
-; AVX512F-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vpermt2q %zmm5, %zmm0, %zmm3
-; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm6, %zmm0, %zmm4
-; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm10, %zmm0, %zmm4
-; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm14, %zmm0, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm16, %zmm0, %zmm4
-; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm31, %zmm0, %zmm23
-; AVX512F-NEXT:    vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm13, %zmm0, %zmm21
+; AVX512F-NEXT:    vpermt2q %zmm10, %zmm1, %zmm19
+; AVX512F-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm14, %zmm1, %zmm21
 ; AVX512F-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm15, %zmm0, %zmm12
-; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [4,5,8,15,4,5,8,15]
-; AVX512F-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqu64 (%rsp), %zmm2 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm6, %zmm0, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, (%rsp) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm10, %zmm0, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm14, %zmm0, %zmm2
+; AVX512F-NEXT:    vpermt2q %zmm15, %zmm1, %zmm22
+; AVX512F-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm20, %zmm1, %zmm24
+; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,11,0,5,6,11]
+; AVX512F-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vpermt2q %zmm7, %zmm1, %zmm25
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm6, %zmm1, %zmm12
+; AVX512F-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm8, %zmm1, %zmm12
+; AVX512F-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm9, %zmm1, %zmm12
+; AVX512F-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm10, %zmm1, %zmm12
+; AVX512F-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm14, %zmm1, %zmm12
+; AVX512F-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm15, %zmm1, %zmm5
+; AVX512F-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm20, %zmm1, %zmm2
 ; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,12,0,5,6,12]
+; AVX512F-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vpermt2q %zmm7, %zmm1, %zmm0
+; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm6, %zmm1, %zmm12
+; AVX512F-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm8, %zmm1, %zmm12
+; AVX512F-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm9, %zmm1, %zmm12
+; AVX512F-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm10, %zmm1, %zmm12
+; AVX512F-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm14, %zmm1, %zmm12
+; AVX512F-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm15, %zmm1, %zmm29
+; AVX512F-NEXT:    vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm20, %zmm1, %zmm16
+; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [4,5,8,15,4,5,8,15]
+; AVX512F-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm6, %zmm1, %zmm18
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm8, %zmm1, %zmm19
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm9, %zmm1, %zmm21
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm10, %zmm1, %zmm22
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm16, %zmm0, %zmm2
+; AVX512F-NEXT:    vpermt2q %zmm14, %zmm1, %zmm2
 ; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm31, %zmm0, %zmm19
-; AVX512F-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm13, %zmm0, %zmm27
-; AVX512F-NEXT:    vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm15, %zmm0, %zmm8
-; AVX512F-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm5, %zmm0, %zmm1
-; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm15, %zmm1, %zmm11
+; AVX512F-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm20, %zmm1, %zmm3
+; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm7, %zmm1, %zmm31
+; AVX512F-NEXT:    vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-NEXT:    movb $24, %al
-; AVX512F-NEXT:    kmovw %eax, %k1
-; AVX512F-NEXT:    vmovdqa64 {{.*#+}} ymm23 = <0,7,14,u>
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm16, %zmm23, %zmm21
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm29, %zmm17
-; AVX512F-NEXT:    vshufi64x2 {{.*#+}} zmm21 {%k1} = zmm13[4,5,4,5],zmm29[4,5,4,5]
-; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm29 = [7,0,9,0,7,0,9,0]
-; AVX512F-NEXT:    # zmm29 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm17, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm13, %zmm29, %zmm0
+; AVX512F-NEXT:    kmovw %eax, %k2
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512F-NEXT:    vshufi64x2 {{.*#+}} zmm12 {%k2} = zmm0[4,5,4,5],zmm13[4,5,4,5]
+; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm15 = [7,0,9,0,7,0,9,0]
+; AVX512F-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm20
+; AVX512F-NEXT:    vpermt2q %zmm0, %zmm15, %zmm20
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [4,11,4,11]
+; AVX512F-NEXT:    # ymm3 = mem[0,1,0,1]
+; AVX512F-NEXT:    vpermt2q %zmm13, %zmm3, %zmm0
 ; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm6 = [4,11,4,11]
-; AVX512F-NEXT:    # ymm6 = mem[0,1,0,1]
-; AVX512F-NEXT:    vpermt2q %zmm17, %zmm6, %zmm13
-; AVX512F-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm20, %zmm23, %zmm24
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
-; AVX512F-NEXT:    vshufi64x2 {{.*#+}} zmm24 {%k1} = zmm1[4,5,4,5],zmm25[4,5,4,5]
-; AVX512F-NEXT:    vmovdqa64 %zmm25, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm29, %zmm0
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512F-NEXT:    vshufi64x2 {{.*#+}} zmm10 {%k2} = zmm0[4,5,4,5],zmm17[4,5,4,5]
+; AVX512F-NEXT:    vmovdqa64 %zmm17, %zmm29
+; AVX512F-NEXT:    vpermt2q %zmm0, %zmm15, %zmm29
+; AVX512F-NEXT:    vpermt2q %zmm17, %zmm3, %zmm0
 ; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm25, %zmm6, %zmm1
-; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm9, %zmm23, %zmm31
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
-; AVX512F-NEXT:    vshufi64x2 {{.*#+}} zmm31 {%k1} = zmm1[4,5,4,5],zmm17[4,5,4,5]
-; AVX512F-NEXT:    vmovdqa64 %zmm17, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm29, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm17, %zmm6, %zmm1
-; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm7, %zmm23, %zmm25
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512F-NEXT:    vshufi64x2 {{.*#+}} zmm25 {%k1} = zmm1[4,5,4,5],zmm5[4,5,4,5]
-; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm29, %zmm0
+; AVX512F-NEXT:    vshufi64x2 {{.*#+}} zmm9 {%k2} = zmm0[4,5,4,5],zmm26[4,5,4,5]
+; AVX512F-NEXT:    vmovdqa64 %zmm26, %zmm17
+; AVX512F-NEXT:    vpermt2q %zmm0, %zmm15, %zmm17
+; AVX512F-NEXT:    vpermt2q %zmm26, %zmm3, %zmm0
+; AVX512F-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512F-NEXT:    vshufi64x2 {{.*#+}} zmm8 {%k2} = zmm2[4,5,4,5],zmm4[4,5,4,5]
+; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm0
+; AVX512F-NEXT:    vpermt2q %zmm2, %zmm15, %zmm0
 ; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm5, %zmm6, %zmm1
-; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm18, %zmm23, %zmm19
+; AVX512F-NEXT:    vpermt2q %zmm4, %zmm3, %zmm2
+; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT:    vshufi64x2 {{.*#+}} zmm19 {%k1} = zmm1[4,5,4,5],zmm3[4,5,4,5]
-; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm29, %zmm0
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512F-NEXT:    vshufi64x2 {{.*#+}} zmm7 {%k2} = zmm0[4,5,4,5],zmm1[4,5,4,5]
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm2
+; AVX512F-NEXT:    vpermt2q %zmm0, %zmm15, %zmm2
+; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm1, %zmm3, %zmm0
 ; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm3, %zmm6, %zmm1
-; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm15, %zmm17
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm14, %zmm23, %zmm17
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT:    vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm1[4,5,4,5],zmm3[4,5,4,5]
-; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm29, %zmm0
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512F-NEXT:    vshufi64x2 {{.*#+}} zmm6 {%k2} = zmm0[4,5,4,5],zmm1[4,5,4,5]
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm2
+; AVX512F-NEXT:    vpermt2q %zmm0, %zmm15, %zmm2
+; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm1, %zmm3, %zmm0
 ; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm3, %zmm6, %zmm1
-; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm11
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm0, %zmm23, %zmm11
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512F-NEXT:    vpermi2q %zmm13, %zmm10, %zmm23
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512F-NEXT:    vshufi64x2 {{.*#+}} zmm31 {%k2} = zmm0[4,5,4,5],zmm2[4,5,4,5]
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-NEXT:    vshufi64x2 {{.*#+}} zmm23 {%k1} = zmm2[4,5,4,5],zmm4[4,5,4,5]
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm22
+; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm26
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm29, %zmm22
-; AVX512F-NEXT:    vpermi2q %zmm2, %zmm4, %zmm29
-; AVX512F-NEXT:    vpermt2q %zmm4, %zmm6, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vshufi64x2 {{.*#+}} zmm11 {%k1} = zmm1[4,5,4,5],zmm5[4,5,4,5]
-; AVX512F-NEXT:    vpermt2q %zmm5, %zmm6, %zmm1
+; AVX512F-NEXT:    vpermt2q %zmm1, %zmm15, %zmm26
+; AVX512F-NEXT:    vpermi2q %zmm0, %zmm2, %zmm15
+; AVX512F-NEXT:    vpermt2q %zmm2, %zmm3, %zmm0
+; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm0
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-NEXT:    vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm1[4,5,4,5],zmm0[4,5,4,5]
+; AVX512F-NEXT:    vpermt2q %zmm0, %zmm3, %zmm1
 ; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm3 = <9,0,7,u>
-; AVX512F-NEXT:    vmovdqa64 %zmm16, %zmm4
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm2, %zmm3, %zmm16
-; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm1 = [6,13]
-; AVX512F-NEXT:    vpermt2q %zmm4, %zmm1, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm20, %zmm4
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm2, %zmm3, %zmm20
-; AVX512F-NEXT:    vpermt2q %zmm4, %zmm1, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm4
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm2, %zmm3, %zmm9
-; AVX512F-NEXT:    vpermt2q %zmm4, %zmm1, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm7, %zmm4
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm2, %zmm3, %zmm7
-; AVX512F-NEXT:    vpermt2q %zmm4, %zmm1, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm18, %zmm6
+; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm0 = [6,13]
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm2, %zmm3, %zmm6
-; AVX512F-NEXT:    vpermt2q %zmm18, %zmm1, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm14, %zmm5
-; AVX512F-NEXT:    vpermt2q %zmm15, %zmm3, %zmm5
-; AVX512F-NEXT:    vpermt2q %zmm14, %zmm1, %zmm15
-; AVX512F-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm4
-; AVX512F-NEXT:    vpermt2q %zmm8, %zmm3, %zmm4
-; AVX512F-NEXT:    vpermi2q %zmm10, %zmm13, %zmm3
-; AVX512F-NEXT:    vpermt2q %zmm13, %zmm1, %zmm10
-; AVX512F-NEXT:    vmovdqa64 %zmm10, %zmm14
-; AVX512F-NEXT:    vpermt2q %zmm0, %zmm1, %zmm8
-; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm13
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
-; AVX512F-NEXT:    # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512F-NEXT:    vmovups (%rsp), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vinsertf64x4 $0, %ymm1, %zmm0, %zmm0
-; AVX512F-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
-; AVX512F-NEXT:    # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vinsertf64x4 $0, %ymm1, %zmm0, %zmm0
-; AVX512F-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
-; AVX512F-NEXT:    # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vinsertf64x4 $0, %ymm1, %zmm0, %zmm0
-; AVX512F-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
-; AVX512F-NEXT:    # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vinsertf64x4 $0, %ymm1, %zmm0, %zmm0
-; AVX512F-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm11 # 64-byte Folded Reload
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm13 # 64-byte Folded Reload
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm14 # 64-byte Folded Reload
+; AVX512F-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm14 # 64-byte Folded Reload
+; AVX512F-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
+; AVX512F-NEXT:    # ymm0 = ymm1[0,1,2,3],mem[4,5,6,7]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm18, %zmm0
+; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload
+; AVX512F-NEXT:    # ymm0 = ymm2[0,1,2,3],mem[4,5,6,7]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm19, %zmm0
+; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpblendd $240, (%rsp), %ymm3, %ymm0 # 32-byte Folded Reload
+; AVX512F-NEXT:    # ymm0 = ymm3[0,1,2,3],mem[4,5,6,7]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm21, %zmm21
+; AVX512F-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload
+; AVX512F-NEXT:    # ymm0 = ymm5[0,1,2,3],mem[4,5,6,7]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm22, %zmm0
+; AVX512F-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
 ; AVX512F-NEXT:    movb $-32, %al
-; AVX512F-NEXT:    kmovw %eax, %k2
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm24 {%k2}
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm21 {%k2}
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm25 {%k2}
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm31 {%k2}
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm17 {%k2}
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm19 {%k2}
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm11 {%k2}
-; AVX512F-NEXT:    vmovdqa64 %zmm26, %zmm23 {%k2}
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm4 {%k1}
-; AVX512F-NEXT:    vmovdqa64 %zmm30, %zmm4 {%k2}
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm16 {%k1}
+; AVX512F-NEXT:    kmovw %eax, %k1
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm16 {%k2}
+; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm10 {%k1}
+; AVX512F-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm20 {%k1}
+; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm12 {%k1}
+; AVX512F-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm20 {%k2}
+; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm8 {%k1}
+; AVX512F-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm9 {%k1}
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm9 {%k2}
+; AVX512F-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm30, %zmm6 {%k1}
+; AVX512F-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm7 {%k1}
+; AVX512F-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm28, %zmm4 {%k1}
+; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm23, %zmm31 {%k1}
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm7 {%k2}
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm6 {%k1}
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm6 {%k2}
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm5 {%k1}
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm5 {%k2}
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm3 {%k1}
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm3 {%k2}
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm28, %zmm0 {%k2}
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k2}
+; AVX512F-NEXT:    vmovdqa64 %zmm27, %zmm0 {%k1}
 ; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm30 {%k2}
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm27 {%k2}
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm10 {%k2}
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm18 {%k2}
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k2}
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1}
+; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm8 {%k2}
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k2}
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1}
+; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k2}
-; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k2}
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1}
+; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k2}
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
 ; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
-; AVX512F-NEXT:    # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512F-NEXT:    vmovdqa 2752(%rdi), %ymm15
-; AVX512F-NEXT:    vpalignr {{.*#+}} ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23]
-; AVX512F-NEXT:    vextracti128 $1, %ymm15, %xmm15
-; AVX512F-NEXT:    vinserti32x4 $0, %xmm15, %zmm22, %zmm28
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm28 {%k2}
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vinsertf64x4 $0, %ymm1, %zmm0, %zmm0
-; AVX512F-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa 512(%rdi), %ymm15
-; AVX512F-NEXT:    vpalignr {{.*#+}} ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23]
-; AVX512F-NEXT:    vextracti128 $1, %ymm15, %xmm15
+; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm23 {%k2}
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vinserti32x4 $0, %xmm15, %zmm0, %zmm2
+; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm23 {%k1}
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k2}
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload
-; AVX512F-NEXT:    # ymm15 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512F-NEXT:    vmovdqa 64(%rdi), %ymm1
-; AVX512F-NEXT:    vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
-; AVX512F-NEXT:    vextracti128 $1, %ymm1, %xmm1
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vinserti32x4 $0, %xmm1, %zmm0, %zmm22
+; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k1}
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm22 {%k2}
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vinsertf64x4 $0, %ymm15, %zmm0, %zmm0
-; AVX512F-NEXT:    vmovups %zmm0, (%rsp) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa 1408(%rdi), %ymm15
-; AVX512F-NEXT:    vpalignr {{.*#+}} ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23]
-; AVX512F-NEXT:    vextracti128 $1, %ymm15, %xmm15
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k2}
+; AVX512F-NEXT:    vmovdqa64 %zmm24, %zmm0 {%k1}
+; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm18
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm25, %zmm7 {%k1}
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vinserti32x4 $0, %xmm15, %zmm0, %zmm26
+; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm22 {%k1}
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm26 {%k2}
-; AVX512F-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm1 # 32-byte Folded Reload
-; AVX512F-NEXT:    # ymm1 = ymm14[0,1,2,3],mem[4,5,6,7]
-; AVX512F-NEXT:    vmovdqa 960(%rdi), %ymm14
-; AVX512F-NEXT:    vpalignr {{.*#+}} ymm14 = mem[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23]
-; AVX512F-NEXT:    vextracti128 $1, %ymm14, %xmm14
+; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm19 {%k1}
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vinserti32x4 $0, %xmm14, %zmm0, %zmm14
+; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm24 {%k1}
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm14 {%k2}
+; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm30 {%k1}
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa 2304(%rdi), %ymm1
-; AVX512F-NEXT:    vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
-; AVX512F-NEXT:    vextracti128 $1, %ymm1, %xmm1
+; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm27 {%k1}
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vinserti32x4 $0, %xmm1, %zmm0, %zmm1
+; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm28 {%k1}
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k2}
-; AVX512F-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm15 # 32-byte Folded Reload
-; AVX512F-NEXT:    # ymm15 = ymm13[0,1,2,3],mem[4,5,6,7]
-; AVX512F-NEXT:    vmovdqa 1856(%rdi), %ymm13
+; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm25 {%k1}
+; AVX512F-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm0 # 32-byte Folded Reload
+; AVX512F-NEXT:    # ymm0 = ymm11[0,1,2,3],mem[4,5,6,7]
+; AVX512F-NEXT:    vmovdqa 2752(%rdi), %ymm3
+; AVX512F-NEXT:    vpalignr {{.*#+}} ymm3 = mem[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23]
+; AVX512F-NEXT:    vextracti128 $1, %ymm3, %xmm3
+; AVX512F-NEXT:    vinserti32x4 $0, %xmm3, %zmm26, %zmm3
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm3 {%k1}
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512F-NEXT:    vmovdqa 512(%rdi), %ymm1
+; AVX512F-NEXT:    vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
+; AVX512F-NEXT:    vextracti128 $1, %ymm1, %xmm1
+; AVX512F-NEXT:    vinserti32x4 $0, %xmm1, %zmm20, %zmm6
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm6 {%k1}
+; AVX512F-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm1 # 32-byte Folded Reload
+; AVX512F-NEXT:    # ymm1 = ymm13[0,1,2,3],mem[4,5,6,7]
+; AVX512F-NEXT:    vmovdqa 64(%rdi), %ymm4
+; AVX512F-NEXT:    vpalignr {{.*#+}} ymm4 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23]
+; AVX512F-NEXT:    vextracti128 $1, %ymm4, %xmm4
+; AVX512F-NEXT:    vinserti32x4 $0, %xmm4, %zmm29, %zmm14
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm14 {%k1}
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
+; AVX512F-NEXT:    vmovdqa 1408(%rdi), %ymm4
+; AVX512F-NEXT:    vpalignr {{.*#+}} ymm4 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23]
+; AVX512F-NEXT:    vextracti128 $1, %ymm4, %xmm4
+; AVX512F-NEXT:    vinserti32x4 $0, %xmm4, %zmm17, %zmm5
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm5 {%k1}
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-NEXT:    vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
+; AVX512F-NEXT:    # ymm4 = mem[0,1,2,3],ymm4[4,5,6,7]
+; AVX512F-NEXT:    vmovdqa 960(%rdi), %ymm8
+; AVX512F-NEXT:    vpalignr {{.*#+}} ymm8 = mem[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23]
+; AVX512F-NEXT:    vextracti128 $1, %ymm8, %xmm8
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512F-NEXT:    vinserti32x4 $0, %xmm8, %zmm9, %zmm8
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm8 {%k1}
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512F-NEXT:    vinsertf64x4 $0, %ymm4, %zmm9, %zmm4
+; AVX512F-NEXT:    vmovdqa 2304(%rdi), %ymm10
+; AVX512F-NEXT:    vpalignr {{.*#+}} ymm10 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23]
+; AVX512F-NEXT:    vextracti128 $1, %ymm10, %xmm10
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512F-NEXT:    vinserti32x4 $0, %xmm10, %zmm9, %zmm10
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm10 {%k1}
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512F-NEXT:    vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm11 # 32-byte Folded Reload
+; AVX512F-NEXT:    # ymm11 = mem[0,1,2,3],ymm9[4,5,6,7]
+; AVX512F-NEXT:    vmovdqa 1856(%rdi), %ymm12
+; AVX512F-NEXT:    vpalignr {{.*#+}} ymm12 = mem[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23]
+; AVX512F-NEXT:    vextracti128 $1, %ymm12, %xmm12
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512F-NEXT:    vinserti32x4 $0, %xmm12, %zmm9, %zmm12
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm12 {%k1}
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512F-NEXT:    vinsertf64x4 $0, %ymm11, %zmm9, %zmm11
+; AVX512F-NEXT:    vmovdqa 3200(%rdi), %ymm13
 ; AVX512F-NEXT:    vpalignr {{.*#+}} ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23]
 ; AVX512F-NEXT:    vextracti128 $1, %ymm13, %xmm13
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vinserti32x4 $0, %xmm13, %zmm0, %zmm13
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm13 {%k2}
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm15, %zmm0, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa 3200(%rdi), %ymm15
-; AVX512F-NEXT:    vpalignr {{.*#+}} ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23]
-; AVX512F-NEXT:    vextracti128 $1, %ymm15, %xmm15
-; AVX512F-NEXT:    vinserti32x4 $0, %xmm15, %zmm29, %zmm15
-; AVX512F-NEXT:    vmovdqa64 %zmm12, %zmm15 {%k2}
-; AVX512F-NEXT:    vmovdqa64 %zmm23, 448(%rsi)
-; AVX512F-NEXT:    vmovdqa64 %zmm11, 384(%rsi)
-; AVX512F-NEXT:    vmovdqa64 %zmm19, 320(%rsi)
-; AVX512F-NEXT:    vmovdqa64 %zmm17, 256(%rsi)
-; AVX512F-NEXT:    vmovdqa64 %zmm31, 192(%rsi)
-; AVX512F-NEXT:    vmovdqa64 %zmm25, 128(%rsi)
-; AVX512F-NEXT:    vmovdqa64 %zmm21, 64(%rsi)
-; AVX512F-NEXT:    vmovdqa64 %zmm24, (%rsi)
-; AVX512F-NEXT:    vmovdqa64 %zmm3, 448(%rdx)
-; AVX512F-NEXT:    vmovdqa64 %zmm5, 256(%rdx)
-; AVX512F-NEXT:    vmovdqa64 %zmm6, 320(%rdx)
-; AVX512F-NEXT:    vmovdqa64 %zmm7, 128(%rdx)
-; AVX512F-NEXT:    vmovdqa64 %zmm9, 192(%rdx)
-; AVX512F-NEXT:    vmovdqa64 %zmm20, (%rdx)
-; AVX512F-NEXT:    vmovdqa64 %zmm16, 64(%rdx)
-; AVX512F-NEXT:    vmovdqa64 %zmm4, 384(%rdx)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 448(%rcx)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 256(%rcx)
-; AVX512F-NEXT:    vmovdqa64 %zmm8, 320(%rcx)
-; AVX512F-NEXT:    vmovdqa64 %zmm18, 128(%rcx)
-; AVX512F-NEXT:    vmovdqa64 %zmm10, 192(%rcx)
-; AVX512F-NEXT:    vmovdqa64 %zmm27, (%rcx)
-; AVX512F-NEXT:    vmovdqa64 %zmm30, 64(%rcx)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 384(%rcx)
-; AVX512F-NEXT:    vmovdqa64 %zmm15, 448(%r8)
-; AVX512F-NEXT:    vmovdqa64 %zmm13, 256(%r8)
-; AVX512F-NEXT:    vmovdqa64 %zmm1, 320(%r8)
-; AVX512F-NEXT:    vmovdqa64 %zmm14, 128(%r8)
-; AVX512F-NEXT:    vmovdqa64 %zmm26, 192(%r8)
-; AVX512F-NEXT:    vmovdqa64 %zmm22, (%r8)
-; AVX512F-NEXT:    vmovdqa64 %zmm2, 64(%r8)
-; AVX512F-NEXT:    vmovdqa64 %zmm28, 384(%r8)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 448(%r9)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 256(%r9)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 320(%r9)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 128(%r9)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 192(%r9)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, (%r9)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 64(%r9)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 384(%r9)
+; AVX512F-NEXT:    vinserti32x4 $0, %xmm13, %zmm15, %zmm9
+; AVX512F-NEXT:    vmovdqa64 %zmm16, %zmm9 {%k1}
+; AVX512F-NEXT:    vmovdqa64 %zmm31, 448(%rsi)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm13, 384(%rsi)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm13, 320(%rsi)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm13, 256(%rsi)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm13, 192(%rsi)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm13, 128(%rsi)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm13, 64(%rsi)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm13, (%rsi)
+; AVX512F-NEXT:    vmovdqa64 %zmm18, 448(%rdx)
+; AVX512F-NEXT:    vmovdqa64 %zmm2, 256(%rdx)
+; AVX512F-NEXT:    vmovdqa64 %zmm23, 320(%rdx)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm2, 128(%rdx)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm2, 192(%rdx)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm2, (%rdx)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm2, 64(%rdx)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm2, 384(%rdx)
+; AVX512F-NEXT:    vmovdqa64 %zmm25, 448(%rcx)
+; AVX512F-NEXT:    vmovdqa64 %zmm28, 256(%rcx)
+; AVX512F-NEXT:    vmovdqa64 %zmm27, 320(%rcx)
+; AVX512F-NEXT:    vmovdqa64 %zmm30, 128(%rcx)
+; AVX512F-NEXT:    vmovdqa64 %zmm24, 192(%rcx)
+; AVX512F-NEXT:    vmovdqa64 %zmm19, (%rcx)
+; AVX512F-NEXT:    vmovdqa64 %zmm22, 64(%rcx)
+; AVX512F-NEXT:    vmovdqa64 %zmm7, 384(%rcx)
+; AVX512F-NEXT:    vmovdqa64 %zmm9, 448(%r8)
+; AVX512F-NEXT:    vmovdqa64 %zmm12, 256(%r8)
+; AVX512F-NEXT:    vmovdqa64 %zmm10, 320(%r8)
+; AVX512F-NEXT:    vmovdqa64 %zmm8, 128(%r8)
+; AVX512F-NEXT:    vmovdqa64 %zmm5, 192(%r8)
+; AVX512F-NEXT:    vmovdqa64 %zmm14, (%r8)
+; AVX512F-NEXT:    vmovdqa64 %zmm6, 64(%r8)
+; AVX512F-NEXT:    vmovdqa64 %zmm3, 384(%r8)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm3, 448(%r9)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm3, 256(%r9)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm3, 320(%r9)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm3, 128(%r9)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm3, 192(%r9)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm3, (%r9)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm3, 64(%r9)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm3, 384(%r9)
 ; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 448(%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 256(%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 320(%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 128(%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 192(%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, (%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 64(%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 384(%rax)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm2, 448(%rax)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm2, 256(%rax)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm2, 320(%rax)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm2, 128(%rax)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm2, 192(%rax)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm2, (%rax)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm2, 64(%rax)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm3, 384(%rax)
 ; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 384(%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 448(%rax)
+; AVX512F-NEXT:    vmovaps %zmm11, 384(%rax)
+; AVX512F-NEXT:    vmovaps %zmm4, 448(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm1, 256(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm0, 320(%rax)
 ; AVX512F-NEXT:    vmovups (%rsp), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 256(%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 320(%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512F-NEXT:    vmovaps %zmm0, 128(%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 192(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm21, 192(%rax)
 ; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512F-NEXT:    vmovaps %zmm0, (%rax)
 ; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512F-NEXT:    vmovaps %zmm0, 64(%rax)
-; AVX512F-NEXT:    addq $6728, %rsp # imm = 0x1A48
+; AVX512F-NEXT:    addq $7624, %rsp # imm = 0x1DC8
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512BW-LABEL: load_i64_stride7_vf64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    subq $6664, %rsp # imm = 0x1A08
-; AVX512BW-NEXT:    vmovdqa64 3328(%rdi), %zmm17
-; AVX512BW-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 3264(%rdi), %zmm4
+; AVX512BW-NEXT:    subq $7624, %rsp # imm = 0x1DC8
+; AVX512BW-NEXT:    vmovdqa64 3328(%rdi), %zmm4
 ; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 3008(%rdi), %zmm19
+; AVX512BW-NEXT:    vmovdqa64 3264(%rdi), %zmm6
+; AVX512BW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 3008(%rdi), %zmm25
 ; AVX512BW-NEXT:    vmovdqa64 2944(%rdi), %zmm18
-; AVX512BW-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqa64 2880(%rdi), %zmm2
 ; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqa64 2816(%rdi), %zmm1
 ; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 2752(%rdi), %zmm20
-; AVX512BW-NEXT:    vmovdqa64 2688(%rdi), %zmm15
-; AVX512BW-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 2432(%rdi), %zmm5
+; AVX512BW-NEXT:    vmovdqa64 2752(%rdi), %zmm30
+; AVX512BW-NEXT:    vmovdqa64 2688(%rdi), %zmm5
 ; AVX512BW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 2368(%rdi), %zmm6
-; AVX512BW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 1984(%rdi), %zmm8
+; AVX512BW-NEXT:    vmovdqa64 2432(%rdi), %zmm19
+; AVX512BW-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 2368(%rdi), %zmm8
 ; AVX512BW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 1920(%rdi), %zmm9
+; AVX512BW-NEXT:    vmovdqa64 1984(%rdi), %zmm9
 ; AVX512BW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 1920(%rdi), %zmm3
+; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqa64 1536(%rdi), %zmm10
 ; AVX512BW-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 1472(%rdi), %zmm11
+; AVX512BW-NEXT:    vmovdqa64 1472(%rdi), %zmm12
+; AVX512BW-NEXT:    vmovdqu64 %zmm12, (%rsp) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 1088(%rdi), %zmm11
 ; AVX512BW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 1088(%rdi), %zmm12
-; AVX512BW-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqa64 1024(%rdi), %zmm13
 ; AVX512BW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqa64 640(%rdi), %zmm16
 ; AVX512BW-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 576(%rdi), %zmm7
-; AVX512BW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 192(%rdi), %zmm14
+; AVX512BW-NEXT:    vmovdqa64 576(%rdi), %zmm14
 ; AVX512BW-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 128(%rdi), %zmm3
-; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 192(%rdi), %zmm17
+; AVX512BW-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 128(%rdi), %zmm15
+; AVX512BW-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7]
 ; AVX512BW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm0, %zmm1
@@ -9034,867 +9130,891 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
 ; AVX512BW-NEXT:    vinserti32x4 $0, %xmm2, %zmm1, %zmm1
 ; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm1
+; AVX512BW-NEXT:    vmovdqa64 %zmm14, %zmm1
 ; AVX512BW-NEXT:    vpermt2q %zmm16, %zmm0, %zmm1
 ; AVX512BW-NEXT:    vmovdqa 464(%rdi), %xmm2
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
 ; AVX512BW-NEXT:    vinserti32x4 $0, %xmm2, %zmm1, %zmm1
 ; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm14, %zmm0, %zmm3
+; AVX512BW-NEXT:    vmovdqa64 %zmm15, %zmm1
+; AVX512BW-NEXT:    vpermt2q %zmm17, %zmm0, %zmm1
 ; AVX512BW-NEXT:    vmovdqa 16(%rdi), %xmm2
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
-; AVX512BW-NEXT:    vinserti32x4 $0, %xmm2, %zmm3, %zmm1
+; AVX512BW-NEXT:    vinserti32x4 $0, %xmm2, %zmm1, %zmm1
 ; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm1
+; AVX512BW-NEXT:    vmovdqa64 %zmm12, %zmm1
 ; AVX512BW-NEXT:    vpermt2q %zmm10, %zmm0, %zmm1
 ; AVX512BW-NEXT:    vmovdqa 1360(%rdi), %xmm2
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
 ; AVX512BW-NEXT:    vinserti32x4 $0, %xmm2, %zmm1, %zmm1
 ; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm1
-; AVX512BW-NEXT:    vpermt2q %zmm12, %zmm0, %zmm1
+; AVX512BW-NEXT:    vpermt2q %zmm11, %zmm0, %zmm1
 ; AVX512BW-NEXT:    vmovdqa 912(%rdi), %xmm2
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
 ; AVX512BW-NEXT:    vinserti32x4 $0, %xmm2, %zmm1, %zmm1
 ; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm1
-; AVX512BW-NEXT:    vpermt2q %zmm5, %zmm0, %zmm1
+; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm1
+; AVX512BW-NEXT:    vpermt2q %zmm19, %zmm0, %zmm1
 ; AVX512BW-NEXT:    vmovdqa 2256(%rdi), %xmm2
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
 ; AVX512BW-NEXT:    vinserti32x4 $0, %xmm2, %zmm1, %zmm1
 ; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm1
-; AVX512BW-NEXT:    vpermt2q %zmm8, %zmm0, %zmm1
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm1
+; AVX512BW-NEXT:    vpermt2q %zmm9, %zmm0, %zmm1
 ; AVX512BW-NEXT:    vmovdqa 1808(%rdi), %xmm2
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
 ; AVX512BW-NEXT:    vinserti32x4 $0, %xmm2, %zmm1, %zmm1
 ; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermi2q %zmm17, %zmm4, %zmm0
+; AVX512BW-NEXT:    vpermi2q %zmm4, %zmm6, %zmm0
 ; AVX512BW-NEXT:    vmovdqa 3152(%rdi), %xmm1
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3]
 ; AVX512BW-NEXT:    vinserti32x4 $0, %xmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vinserti128 $1, 2880(%rdi), %ymm0, %ymm1
-; AVX512BW-NEXT:    vmovdqa 2816(%rdi), %ymm0
-; AVX512BW-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7]
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm5 = [4,11]
-; AVX512BW-NEXT:    vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm20, %zmm5, %zmm15
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm15[0,1,2,3],ymm1[4,5,6,7]
-; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm4 = [0,7,14,0,0,7,14,0]
+; AVX512BW-NEXT:    vinserti128 $1, 2880(%rdi), %ymm0, %ymm0
+; AVX512BW-NEXT:    vmovdqa 2816(%rdi), %ymm1
+; AVX512BW-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm9 = [4,11]
+; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm1
+; AVX512BW-NEXT:    vpermt2q %zmm30, %zmm9, %zmm1
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [0,7,14,0,0,7,14,0]
+; AVX512BW-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vmovdqa64 %zmm18, %zmm16
+; AVX512BW-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm18, %zmm1
+; AVX512BW-NEXT:    vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm25, %zmm10, %zmm1
+; AVX512BW-NEXT:    vmovdqa64 3072(%rdi), %zmm2
+; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,13,4,5,6,13]
 ; AVX512BW-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm18, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 %zmm19, %zmm15
-; AVX512BW-NEXT:    vpermt2q %zmm19, %zmm4, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 3072(%rdi), %zmm0
+; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm4, %zmm1
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
 ; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [4,5,6,13,4,5,6,13]
-; AVX512BW-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm8, %zmm2
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm2, %zmm1
+; AVX512BW-NEXT:    vinserti128 $1, 640(%rdi), %ymm0, %ymm0
+; AVX512BW-NEXT:    vmovdqa 576(%rdi), %ymm1
+; AVX512BW-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
+; AVX512BW-NEXT:    vmovdqa64 512(%rdi), %zmm3
+; AVX512BW-NEXT:    vmovdqa64 448(%rdi), %zmm2
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm1
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm12
+; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm9, %zmm1
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm27
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqa64 768(%rdi), %zmm28
+; AVX512BW-NEXT:    vmovdqa64 704(%rdi), %zmm1
+; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm28, %zmm10, %zmm1
+; AVX512BW-NEXT:    vmovdqa64 832(%rdi), %zmm2
+; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm4, %zmm1
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vinserti128 $1, 192(%rdi), %ymm0, %ymm0
+; AVX512BW-NEXT:    vmovdqa 128(%rdi), %ymm1
+; AVX512BW-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
+; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %zmm3
+; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm8
+; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm1
+; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm9, %zmm1
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm31
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqa64 320(%rdi), %zmm18
+; AVX512BW-NEXT:    vmovdqa64 256(%rdi), %zmm29
+; AVX512BW-NEXT:    vmovdqa64 %zmm29, %zmm1
+; AVX512BW-NEXT:    vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm18, %zmm10, %zmm1
+; AVX512BW-NEXT:    vmovdqa64 384(%rdi), %zmm2
+; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm4, %zmm1
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vinserti128 $1, 1536(%rdi), %ymm0, %ymm0
+; AVX512BW-NEXT:    vmovdqa 1472(%rdi), %ymm1
+; AVX512BW-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
+; AVX512BW-NEXT:    vmovdqa64 1408(%rdi), %zmm26
+; AVX512BW-NEXT:    vmovdqa64 1344(%rdi), %zmm7
+; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm1
+; AVX512BW-NEXT:    vpermt2q %zmm26, %zmm9, %zmm1
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqa64 1664(%rdi), %zmm13
+; AVX512BW-NEXT:    vmovdqa64 1600(%rdi), %zmm1
+; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm13, %zmm10, %zmm1
+; AVX512BW-NEXT:    vmovdqa64 1728(%rdi), %zmm2
+; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm4, %zmm1
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vinserti128 $1, 1088(%rdi), %ymm0, %ymm0
+; AVX512BW-NEXT:    vmovdqa 1024(%rdi), %ymm1
+; AVX512BW-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
+; AVX512BW-NEXT:    vmovdqa64 960(%rdi), %zmm19
+; AVX512BW-NEXT:    vmovdqa64 896(%rdi), %zmm6
+; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm1
+; AVX512BW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm19, %zmm9, %zmm1
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqa64 1216(%rdi), %zmm22
+; AVX512BW-NEXT:    vmovdqa64 1152(%rdi), %zmm1
 ; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vinserti128 $1, 640(%rdi), %ymm0, %ymm2
-; AVX512BW-NEXT:    vmovdqa 576(%rdi), %ymm1
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5],ymm2[6,7]
-; AVX512BW-NEXT:    vmovdqa64 512(%rdi), %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 448(%rdi), %zmm3
-; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm5, %zmm3
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqa64 768(%rdi), %zmm25
-; AVX512BW-NEXT:    vmovdqa64 704(%rdi), %zmm3
-; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm25, %zmm4, %zmm3
-; AVX512BW-NEXT:    vmovdqa64 832(%rdi), %zmm7
-; AVX512BW-NEXT:    vpermt2q %zmm7, %zmm8, %zmm3
-; AVX512BW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512BW-NEXT:    vpermt2q %zmm22, %zmm10, %zmm1
+; AVX512BW-NEXT:    vmovdqa64 1280(%rdi), %zmm2
 ; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vinserti128 $1, 192(%rdi), %ymm0, %ymm2
-; AVX512BW-NEXT:    vmovdqa 128(%rdi), %ymm0
-; AVX512BW-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm2[6,7]
-; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %zmm13
-; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm3
+; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm4, %zmm1
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vinserti128 $1, 2432(%rdi), %ymm0, %ymm0
+; AVX512BW-NEXT:    vmovdqa 2368(%rdi), %ymm1
+; AVX512BW-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
+; AVX512BW-NEXT:    vmovdqa64 2304(%rdi), %zmm20
+; AVX512BW-NEXT:    vmovdqa64 2240(%rdi), %zmm5
+; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm1
+; AVX512BW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm20, %zmm9, %zmm1
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqa64 2560(%rdi), %zmm15
+; AVX512BW-NEXT:    vmovdqa64 2496(%rdi), %zmm1
+; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm15, %zmm10, %zmm1
+; AVX512BW-NEXT:    vmovdqa64 2624(%rdi), %zmm3
 ; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm13, %zmm5, %zmm3
-; AVX512BW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqa64 320(%rdi), %zmm14
-; AVX512BW-NEXT:    vmovdqa64 256(%rdi), %zmm3
-; AVX512BW-NEXT:    vmovdqu64 %zmm3, (%rsp) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm14, %zmm4, %zmm3
-; AVX512BW-NEXT:    vmovdqa64 384(%rdi), %zmm30
-; AVX512BW-NEXT:    vpermt2q %zmm30, %zmm8, %zmm3
-; AVX512BW-NEXT:    vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm4, %zmm1
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vinserti128 $1, 1984(%rdi), %ymm0, %ymm0
+; AVX512BW-NEXT:    vmovdqa 1920(%rdi), %ymm1
+; AVX512BW-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
+; AVX512BW-NEXT:    vmovdqa64 1856(%rdi), %zmm21
+; AVX512BW-NEXT:    vmovdqa64 1792(%rdi), %zmm2
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm1
 ; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vinserti128 $1, 1536(%rdi), %ymm0, %ymm2
-; AVX512BW-NEXT:    vmovdqa 1472(%rdi), %ymm3
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
-; AVX512BW-NEXT:    vmovdqa64 1408(%rdi), %zmm29
-; AVX512BW-NEXT:    vmovdqa64 1344(%rdi), %zmm6
-; AVX512BW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm29, %zmm5, %zmm6
-; AVX512BW-NEXT:    vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm2[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqa64 1664(%rdi), %zmm2
-; AVX512BW-NEXT:    vmovdqa64 1600(%rdi), %zmm9
+; AVX512BW-NEXT:    vpermt2q %zmm21, %zmm9, %zmm1
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqa64 2112(%rdi), %zmm14
+; AVX512BW-NEXT:    vmovdqa64 2048(%rdi), %zmm1
+; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm14, %zmm10, %zmm1
+; AVX512BW-NEXT:    vmovdqa64 2176(%rdi), %zmm23
+; AVX512BW-NEXT:    vpermt2q %zmm23, %zmm4, %zmm1
+; AVX512BW-NEXT:    vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vinserti128 $1, 3328(%rdi), %ymm0, %ymm0
+; AVX512BW-NEXT:    vmovdqa 3264(%rdi), %ymm1
+; AVX512BW-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
+; AVX512BW-NEXT:    vmovdqa64 3200(%rdi), %zmm17
+; AVX512BW-NEXT:    vmovdqa64 3136(%rdi), %zmm11
+; AVX512BW-NEXT:    vpermi2q %zmm17, %zmm11, %zmm9
+; AVX512BW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm9
+; AVX512BW-NEXT:    vpermt2q %zmm31, %zmm10, %zmm9
 ; AVX512BW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm4, %zmm9
-; AVX512BW-NEXT:    vmovdqa64 1728(%rdi), %zmm28
-; AVX512BW-NEXT:    vpermt2q %zmm28, %zmm8, %zmm9
-; AVX512BW-NEXT:    vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm6, %zmm9, %zmm6
+; AVX512BW-NEXT:    vmovdqa64 %zmm12, %zmm1
+; AVX512BW-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm12, %zmm9
+; AVX512BW-NEXT:    vpermt2q %zmm27, %zmm10, %zmm9
+; AVX512BW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm19, %zmm10, %zmm6
 ; AVX512BW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vinserti128 $1, 1088(%rdi), %ymm0, %ymm9
-; AVX512BW-NEXT:    vmovdqa 1024(%rdi), %ymm0
-; AVX512BW-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm9 = ymm0[0,1,2,3,4,5],ymm9[6,7]
-; AVX512BW-NEXT:    vmovdqa64 960(%rdi), %zmm27
-; AVX512BW-NEXT:    vmovdqa64 896(%rdi), %zmm10
-; AVX512BW-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm27, %zmm5, %zmm10
-; AVX512BW-NEXT:    vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqa64 1216(%rdi), %zmm22
-; AVX512BW-NEXT:    vmovdqa64 1152(%rdi), %zmm10
-; AVX512BW-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm22, %zmm4, %zmm10
-; AVX512BW-NEXT:    vmovdqa64 1280(%rdi), %zmm24
-; AVX512BW-NEXT:    vpermt2q %zmm24, %zmm8, %zmm10
-; AVX512BW-NEXT:    vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm9, %zmm10, %zmm9
+; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm9
+; AVX512BW-NEXT:    vpermt2q %zmm26, %zmm10, %zmm9
 ; AVX512BW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vinserti128 $1, 2432(%rdi), %ymm0, %ymm9
-; AVX512BW-NEXT:    vmovdqa 2368(%rdi), %ymm0
-; AVX512BW-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm9 = ymm0[0,1,2,3,4,5],ymm9[6,7]
-; AVX512BW-NEXT:    vmovdqa64 2304(%rdi), %zmm21
-; AVX512BW-NEXT:    vmovdqa64 2240(%rdi), %zmm10
-; AVX512BW-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm21, %zmm5, %zmm10
-; AVX512BW-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqa64 2560(%rdi), %zmm31
-; AVX512BW-NEXT:    vmovdqa64 2496(%rdi), %zmm10
-; AVX512BW-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm31, %zmm4, %zmm10
-; AVX512BW-NEXT:    vmovdqa64 2624(%rdi), %zmm19
-; AVX512BW-NEXT:    vpermt2q %zmm19, %zmm8, %zmm10
-; AVX512BW-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm9, %zmm10, %zmm9
+; AVX512BW-NEXT:    vpermt2q %zmm21, %zmm10, %zmm2
+; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm20, %zmm10, %zmm5
+; AVX512BW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm9
+; AVX512BW-NEXT:    vmovdqa64 %zmm30, %zmm6
+; AVX512BW-NEXT:    vpermt2q %zmm30, %zmm10, %zmm9
 ; AVX512BW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vinserti128 $1, 1984(%rdi), %ymm0, %ymm9
-; AVX512BW-NEXT:    vmovdqa 1920(%rdi), %ymm0
-; AVX512BW-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm9 = ymm0[0,1,2,3,4,5],ymm9[6,7]
-; AVX512BW-NEXT:    vmovdqa64 1856(%rdi), %zmm18
-; AVX512BW-NEXT:    vmovdqa64 1792(%rdi), %zmm11
-; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm12
+; AVX512BW-NEXT:    vpermt2q %zmm17, %zmm10, %zmm11
 ; AVX512BW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm18, %zmm5, %zmm12
-; AVX512BW-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3],ymm9[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqa64 2112(%rdi), %zmm17
-; AVX512BW-NEXT:    vmovdqa64 2048(%rdi), %zmm12
-; AVX512BW-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm17, %zmm4, %zmm12
-; AVX512BW-NEXT:    vmovdqa64 2176(%rdi), %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm8, %zmm12
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm23
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm9, %zmm12, %zmm9
-; AVX512BW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vinserti128 $1, 3328(%rdi), %ymm0, %ymm9
-; AVX512BW-NEXT:    vmovdqa 3264(%rdi), %ymm12
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3,4,5],ymm9[6,7]
-; AVX512BW-NEXT:    vmovdqa64 3200(%rdi), %zmm26
-; AVX512BW-NEXT:    vmovdqa64 3136(%rdi), %zmm10
-; AVX512BW-NEXT:    vpermi2q %zmm26, %zmm10, %zmm5
-; AVX512BW-NEXT:    vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm9[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqa64 3456(%rdi), %zmm6
-; AVX512BW-NEXT:    vmovdqa64 3392(%rdi), %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermi2q %zmm6, %zmm0, %zmm4
-; AVX512BW-NEXT:    vmovdqa64 3520(%rdi), %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm8, %zmm4
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm16
+; AVX512BW-NEXT:    vmovdqa64 3456(%rdi), %zmm24
+; AVX512BW-NEXT:    vmovdqa64 3392(%rdi), %zmm2
+; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermi2q %zmm24, %zmm2, %zmm10
+; AVX512BW-NEXT:    vmovdqa64 3520(%rdi), %zmm30
+; AVX512BW-NEXT:    vpermt2q %zmm30, %zmm4, %zmm10
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm10, %zmm0
 ; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm5, %zmm4, %zmm4
-; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa 2880(%rdi), %ymm4
-; AVX512BW-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
-; AVX512BW-NEXT:    # ymm4 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23]
+; AVX512BW-NEXT:    vmovdqa 2880(%rdi), %ymm0
+; AVX512BW-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload
+; AVX512BW-NEXT:    # ymm4 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
 ; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm0 = [5,12]
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm20, %zmm0, %zmm5
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm8 = ymm5[0,1,2,3],ymm4[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm9
+; AVX512BW-NEXT:    vpermt2q %zmm6, %zmm0, %zmm9
+; AVX512BW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm4[4,5,6,7]
 ; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm4 = [9,0,7,0,9,0,7,0]
 ; AVX512BW-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm15 # 64-byte Folded Reload
-; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm5 = [4,5,6,14,4,5,6,14]
-; AVX512BW-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm15 # 64-byte Folded Reload
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm8, %zmm15, %zmm8
+; AVX512BW-NEXT:    vmovdqa64 %zmm25, %zmm12
+; AVX512BW-NEXT:    vpermt2q %zmm16, %zmm4, %zmm12
+; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [4,5,6,14,4,5,6,14]
+; AVX512BW-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm12 # 64-byte Folded Reload
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm9, %zmm12, %zmm5
+; AVX512BW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa 640(%rdi), %ymm9
+; AVX512BW-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload
+; AVX512BW-NEXT:    # ymm9 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23]
+; AVX512BW-NEXT:    vpermt2q %zmm27, %zmm0, %zmm1
+; AVX512BW-NEXT:    vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm9[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqa64 %zmm28, %zmm12
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm25, %zmm4, %zmm12
+; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm12 # 64-byte Folded Reload
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm9, %zmm12, %zmm5
+; AVX512BW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa 192(%rdi), %ymm9
+; AVX512BW-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload
+; AVX512BW-NEXT:    # ymm9 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23]
 ; AVX512BW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa 640(%rdi), %ymm8
-; AVX512BW-NEXT:    vpalignr {{.*#+}} ymm1 = ymm1[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23]
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm8 # 64-byte Folded Reload
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqa64 %zmm25, %zmm8
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm15, %zmm4, %zmm8
-; AVX512BW-NEXT:    vpermt2q %zmm7, %zmm5, %zmm8
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm8, %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa 192(%rdi), %ymm1
-; AVX512BW-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX512BW-NEXT:    # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
+; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm12
+; AVX512BW-NEXT:    vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm31, %zmm0, %zmm12
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3],ymm9[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqa64 %zmm18, %zmm12
+; AVX512BW-NEXT:    vpermt2q %zmm29, %zmm4, %zmm12
+; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm12 # 64-byte Folded Reload
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm9, %zmm12, %zmm5
+; AVX512BW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa 1536(%rdi), %ymm9
+; AVX512BW-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload
+; AVX512BW-NEXT:    # ymm9 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23]
+; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm5
+; AVX512BW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm12
+; AVX512BW-NEXT:    vmovdqa64 %zmm26, %zmm16
+; AVX512BW-NEXT:    vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm26, %zmm0, %zmm12
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3],ymm9[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm12
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm26, %zmm4, %zmm12
+; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm12 # 64-byte Folded Reload
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm9, %zmm12, %zmm7
+; AVX512BW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa 1088(%rdi), %ymm9
+; AVX512BW-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload
+; AVX512BW-NEXT:    # ymm9 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23]
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm13, %zmm0, %zmm7
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqa64 %zmm14, %zmm7
-; AVX512BW-NEXT:    vmovdqu64 (%rsp), %zmm9 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm9, %zmm4, %zmm7
-; AVX512BW-NEXT:    vpermt2q %zmm30, %zmm5, %zmm7
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm7, %zmm1
+; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm12
+; AVX512BW-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm19, %zmm0, %zmm12
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3],ymm9[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqa64 %zmm22, %zmm12
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm29, %zmm4, %zmm12
+; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm12 # 64-byte Folded Reload
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm9, %zmm12, %zmm9
+; AVX512BW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa 2432(%rdi), %ymm9
+; AVX512BW-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm2 # 32-byte Folded Reload
+; AVX512BW-NEXT:    # ymm2 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23]
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm9
+; AVX512BW-NEXT:    vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm20, %zmm0, %zmm9
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3],ymm2[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqa64 %zmm15, %zmm9
+; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm9 # 64-byte Folded Reload
+; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm9 # 64-byte Folded Reload
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm9, %zmm2
+; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa 1984(%rdi), %ymm2
+; AVX512BW-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
+; AVX512BW-NEXT:    # ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23]
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm3
+; AVX512BW-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm21, %zmm0, %zmm3
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqa64 %zmm14, %zmm3
+; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm3 # 64-byte Folded Reload
+; AVX512BW-NEXT:    vpermt2q %zmm23, %zmm10, %zmm3
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa 3328(%rdi), %ymm2
+; AVX512BW-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm1 # 32-byte Folded Reload
+; AVX512BW-NEXT:    # ymm1 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23]
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermi2q %zmm17, %zmm2, %zmm0
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm6 # 64-byte Folded Reload
+; AVX512BW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm27, %zmm1
+; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm1 # 64-byte Folded Reload
 ; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa 1536(%rdi), %ymm1
-; AVX512BW-NEXT:    vpalignr {{.*#+}} ymm1 = ymm3[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm29, %zmm0, %zmm3
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm3
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm8, %zmm4, %zmm3
-; AVX512BW-NEXT:    vpermt2q %zmm28, %zmm5, %zmm3
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm3, %zmm1
+; AVX512BW-NEXT:    vmovdqa64 %zmm31, %zmm1
+; AVX512BW-NEXT:    vpermt2q %zmm8, %zmm4, %zmm1
 ; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa 1088(%rdi), %ymm1
-; AVX512BW-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX512BW-NEXT:    # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm27, %zmm0, %zmm3
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqa64 %zmm22, %zmm3
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm27, %zmm4, %zmm3
-; AVX512BW-NEXT:    vpermt2q %zmm24, %zmm5, %zmm3
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm3, %zmm1
+; AVX512BW-NEXT:    vmovdqa64 %zmm16, %zmm1
+; AVX512BW-NEXT:    vpermt2q %zmm5, %zmm4, %zmm1
 ; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa 2432(%rdi), %ymm1
-; AVX512BW-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX512BW-NEXT:    # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm21, %zmm0, %zmm3
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqa64 %zmm31, %zmm3
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm28, %zmm4, %zmm3
-; AVX512BW-NEXT:    vpermt2q %zmm19, %zmm5, %zmm3
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm3, %zmm1
+; AVX512BW-NEXT:    vpermt2q %zmm7, %zmm4, %zmm19
+; AVX512BW-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm11, %zmm4, %zmm20
+; AVX512BW-NEXT:    vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm9, %zmm4, %zmm21
+; AVX512BW-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm4, %zmm17
+; AVX512BW-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-NEXT:    vpermi2q %zmm8, %zmm24, %zmm4
+; AVX512BW-NEXT:    vpermt2q %zmm30, %zmm10, %zmm4
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm0
+; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm23 = [10,3,10,3,10,3,10,3]
+; AVX512BW-NEXT:    # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vmovdqa64 %zmm25, %zmm0
+; AVX512BW-NEXT:    vpermt2q %zmm28, %zmm23, %zmm25
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm27 = [11,4,11,4,11,4,11,4]
+; AVX512BW-NEXT:    # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm1
+; AVX512BW-NEXT:    vpermt2q %zmm28, %zmm27, %zmm1
 ; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa 1984(%rdi), %ymm1
-; AVX512BW-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX512BW-NEXT:    # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
-; AVX512BW-NEXT:    vpermt2q %zmm18, %zmm0, %zmm11
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3],ymm1[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqa64 %zmm17, %zmm3
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm30, %zmm4, %zmm3
-; AVX512BW-NEXT:    vpermt2q %zmm23, %zmm5, %zmm3
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm3, %zmm1
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm31 = [12,5,12,5,12,5,12,5]
+; AVX512BW-NEXT:    # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm1
+; AVX512BW-NEXT:    vpermt2q %zmm28, %zmm31, %zmm1
 ; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa 3328(%rdi), %ymm1
-; AVX512BW-NEXT:    vpalignr {{.*#+}} ymm1 = ymm12[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
-; AVX512BW-NEXT:    vpermi2q %zmm26, %zmm10, %zmm0
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512BW-NEXT:    vpermi2q %zmm7, %zmm6, %zmm4
-; AVX512BW-NEXT:    vpermt2q %zmm16, %zmm5, %zmm4
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm0
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm12 = [13,6,13,6,13,6,13,6]
+; AVX512BW-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm1
+; AVX512BW-NEXT:    vpermt2q %zmm28, %zmm12, %zmm1
+; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [2,9,2,9,2,9,2,9]
+; AVX512BW-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vpermt2q %zmm28, %zmm1, %zmm0
 ; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm19 = [10,3,10,3,10,3,10,3]
-; AVX512BW-NEXT:    # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm15, %zmm1
-; AVX512BW-NEXT:    vmovdqa64 %zmm15, %zmm21
-; AVX512BW-NEXT:    vpermt2q %zmm25, %zmm19, %zmm21
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm20 = [11,4,11,4,11,4,11,4]
-; AVX512BW-NEXT:    # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm15, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm25, %zmm20, %zmm0
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm28, %zmm0
+; AVX512BW-NEXT:    vpermt2q %zmm18, %zmm23, %zmm0
 ; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm18 = [12,5,12,5,12,5,12,5]
-; AVX512BW-NEXT:    # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm15, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm25, %zmm18, %zmm0
+; AVX512BW-NEXT:    vmovdqa64 %zmm28, %zmm0
+; AVX512BW-NEXT:    vpermt2q %zmm18, %zmm27, %zmm0
 ; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm11 = [13,6,13,6,13,6,13,6]
-; AVX512BW-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm15, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm25, %zmm11, %zmm0
+; AVX512BW-NEXT:    vmovdqa64 %zmm28, %zmm0
+; AVX512BW-NEXT:    vpermt2q %zmm18, %zmm31, %zmm0
 ; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [2,9,2,9,2,9,2,9]
-; AVX512BW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vpermt2q %zmm25, %zmm0, %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm29
-; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm12
-; AVX512BW-NEXT:    vpermt2q %zmm14, %zmm19, %zmm12
-; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm1
-; AVX512BW-NEXT:    vpermt2q %zmm14, %zmm20, %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm1
-; AVX512BW-NEXT:    vpermt2q %zmm14, %zmm18, %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm1
-; AVX512BW-NEXT:    vpermt2q %zmm14, %zmm11, %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm14, %zmm0, %zmm29
-; AVX512BW-NEXT:    vmovdqu64 %zmm29, (%rsp) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm24
-; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm1
-; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm19, %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm1
-; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm20, %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm1
-; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm18, %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm1
-; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm11, %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm0, %zmm24
-; AVX512BW-NEXT:    vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm27, %zmm9
-; AVX512BW-NEXT:    vpermt2q %zmm22, %zmm19, %zmm9
-; AVX512BW-NEXT:    vmovdqa64 %zmm27, %zmm1
-; AVX512BW-NEXT:    vpermt2q %zmm22, %zmm20, %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm27, %zmm1
-; AVX512BW-NEXT:    vpermt2q %zmm22, %zmm18, %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm27, %zmm1
-; AVX512BW-NEXT:    vmovdqa64 %zmm27, %zmm2
-; AVX512BW-NEXT:    vpermt2q %zmm22, %zmm11, %zmm1
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm27
-; AVX512BW-NEXT:    vpermt2q %zmm22, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm28, %zmm10
-; AVX512BW-NEXT:    vpermt2q %zmm31, %zmm19, %zmm10
-; AVX512BW-NEXT:    vmovdqa64 %zmm28, %zmm14
-; AVX512BW-NEXT:    vpermt2q %zmm31, %zmm20, %zmm14
-; AVX512BW-NEXT:    vmovdqa64 %zmm28, %zmm15
-; AVX512BW-NEXT:    vpermt2q %zmm31, %zmm18, %zmm15
-; AVX512BW-NEXT:    vmovdqa64 %zmm28, %zmm16
-; AVX512BW-NEXT:    vpermt2q %zmm31, %zmm11, %zmm16
-; AVX512BW-NEXT:    vpermt2q %zmm31, %zmm0, %zmm28
+; AVX512BW-NEXT:    vmovdqa64 %zmm28, %zmm0
+; AVX512BW-NEXT:    vpermt2q %zmm18, %zmm12, %zmm0
+; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm18, %zmm1, %zmm28
 ; AVX512BW-NEXT:    vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm30, %zmm1
-; AVX512BW-NEXT:    vpermt2q %zmm17, %zmm19, %zmm1
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm26
-; AVX512BW-NEXT:    vmovdqa64 %zmm30, %zmm5
-; AVX512BW-NEXT:    vpermt2q %zmm17, %zmm20, %zmm5
-; AVX512BW-NEXT:    vmovdqa64 %zmm30, %zmm8
-; AVX512BW-NEXT:    vpermt2q %zmm17, %zmm18, %zmm8
-; AVX512BW-NEXT:    vmovdqa64 %zmm30, %zmm13
-; AVX512BW-NEXT:    vpermt2q %zmm17, %zmm11, %zmm13
-; AVX512BW-NEXT:    vpermt2q %zmm17, %zmm0, %zmm30
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm22, %zmm4
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm19, %zmm4
-; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm24
-; AVX512BW-NEXT:    vpermi2q %zmm6, %zmm7, %zmm19
-; AVX512BW-NEXT:    vmovdqa64 %zmm22, %zmm3
-; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm20, %zmm3
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm17
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm1 # 64-byte Folded Reload
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm31, %zmm11, %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm25, %zmm11, %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm23, %zmm11, %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm1 # 64-byte Folded Reload
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm1 # 64-byte Folded Reload
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm1 # 64-byte Folded Reload
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermi2q %zmm6, %zmm7, %zmm20
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm1 # 64-byte Folded Reload
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm22, %zmm29
-; AVX512BW-NEXT:    vpermt2q %zmm17, %zmm18, %zmm29
-; AVX512BW-NEXT:    vpermi2q %zmm6, %zmm7, %zmm18
-; AVX512BW-NEXT:    vmovdqa64 %zmm22, %zmm1
-; AVX512BW-NEXT:    vpermt2q %zmm17, %zmm11, %zmm1
-; AVX512BW-NEXT:    vpermi2q %zmm6, %zmm7, %zmm11
-; AVX512BW-NEXT:    vpermt2q %zmm6, %zmm0, %zmm24
-; AVX512BW-NEXT:    vpermt2q %zmm17, %zmm0, %zmm22
-; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [0,5,6,9,0,5,6,9]
-; AVX512BW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm7, %zmm0, %zmm12
-; AVX512BW-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm6, %zmm0, %zmm21
-; AVX512BW-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm12, %zmm0, %zmm9
-; AVX512BW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm9, %zmm0, %zmm17
-; AVX512BW-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm0, %zmm26
+; AVX512BW-NEXT:    vmovdqa64 %zmm26, %zmm0
+; AVX512BW-NEXT:    vpermt2q %zmm13, %zmm23, %zmm0
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm28
+; AVX512BW-NEXT:    vmovdqa64 %zmm26, %zmm0
+; AVX512BW-NEXT:    vpermt2q %zmm13, %zmm27, %zmm0
+; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm26, %zmm0
+; AVX512BW-NEXT:    vpermt2q %zmm13, %zmm31, %zmm0
+; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm26, %zmm0
+; AVX512BW-NEXT:    vpermt2q %zmm13, %zmm12, %zmm0
+; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm13, %zmm1, %zmm26
 ; AVX512BW-NEXT:    vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm29, %zmm9
+; AVX512BW-NEXT:    vpermt2q %zmm22, %zmm23, %zmm9
+; AVX512BW-NEXT:    vmovdqa64 %zmm29, %zmm0
+; AVX512BW-NEXT:    vpermt2q %zmm22, %zmm27, %zmm0
+; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm29, %zmm0
+; AVX512BW-NEXT:    vpermt2q %zmm22, %zmm31, %zmm0
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm17
+; AVX512BW-NEXT:    vmovdqa64 %zmm29, %zmm0
+; AVX512BW-NEXT:    vpermt2q %zmm22, %zmm12, %zmm0
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm16
+; AVX512BW-NEXT:    vpermt2q %zmm22, %zmm1, %zmm29
+; AVX512BW-NEXT:    vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm21, %zmm0
+; AVX512BW-NEXT:    vpermt2q %zmm15, %zmm23, %zmm0
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm19
+; AVX512BW-NEXT:    vmovdqa64 %zmm21, %zmm10
+; AVX512BW-NEXT:    vpermt2q %zmm15, %zmm27, %zmm10
+; AVX512BW-NEXT:    vmovdqa64 %zmm21, %zmm13
+; AVX512BW-NEXT:    vpermt2q %zmm15, %zmm31, %zmm13
+; AVX512BW-NEXT:    vmovdqa64 %zmm21, %zmm0
+; AVX512BW-NEXT:    vpermt2q %zmm15, %zmm12, %zmm0
+; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm15, %zmm1, %zmm21
+; AVX512BW-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm2
+; AVX512BW-NEXT:    vpermt2q %zmm14, %zmm23, %zmm2
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm22
+; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm4
+; AVX512BW-NEXT:    vpermt2q %zmm14, %zmm27, %zmm4
+; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm3
+; AVX512BW-NEXT:    vpermt2q %zmm14, %zmm31, %zmm3
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm20
+; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm7
+; AVX512BW-NEXT:    vpermt2q %zmm14, %zmm12, %zmm7
+; AVX512BW-NEXT:    vpermt2q %zmm14, %zmm1, %zmm5
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm26, %zmm0, %zmm10
-; AVX512BW-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm28, %zmm0, %zmm4
-; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm0, %zmm19
-; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [0,5,6,10,0,5,6,10]
-; AVX512BW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vpermt2q %zmm28, %zmm0, %zmm3
-; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm6, %zmm0, %zmm17
-; AVX512BW-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm7, %zmm0, %zmm17
-; AVX512BW-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm9, %zmm0, %zmm17
-; AVX512BW-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm12, %zmm0, %zmm17
-; AVX512BW-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm26, %zmm0, %zmm14
-; AVX512BW-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm0, %zmm5
-; AVX512BW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm0, %zmm20
-; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [0,5,6,11,0,5,6,11]
-; AVX512BW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vpermt2q %zmm28, %zmm0, %zmm29
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm6, %zmm0, %zmm17
-; AVX512BW-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm7, %zmm0, %zmm17
-; AVX512BW-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm9, %zmm0, %zmm17
-; AVX512BW-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm12, %zmm0, %zmm17
-; AVX512BW-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm26, %zmm0, %zmm15
-; AVX512BW-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm0, %zmm8
+; AVX512BW-NEXT:    vmovdqa64 %zmm26, %zmm3
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm18, %zmm23, %zmm3
+; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm29
+; AVX512BW-NEXT:    vpermi2q %zmm24, %zmm8, %zmm23
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm0 # 64-byte Folded Reload
+; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm26, %zmm2
+; AVX512BW-NEXT:    vpermt2q %zmm18, %zmm27, %zmm2
+; AVX512BW-NEXT:    vmovdqa64 %zmm18, %zmm14
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm15, %zmm12, %zmm6
+; AVX512BW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm6 # 64-byte Folded Reload
+; AVX512BW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqu64 (%rsp), %zmm6 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm6 # 64-byte Folded Reload
+; AVX512BW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm6 # 64-byte Folded Reload
+; AVX512BW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm6 # 64-byte Folded Reload
+; AVX512BW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm6, %zmm12, %zmm8
 ; AVX512BW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm0, %zmm18
-; AVX512BW-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [0,5,6,12,0,5,6,12]
-; AVX512BW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vpermt2q %zmm28, %zmm0, %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm6, %zmm0, %zmm17
-; AVX512BW-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm7, %zmm0, %zmm17
-; AVX512BW-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm9, %zmm0, %zmm17
-; AVX512BW-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm12, %zmm0, %zmm27
-; AVX512BW-NEXT:    vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm26, %zmm0, %zmm16
-; AVX512BW-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm1
-; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm0, %zmm13
-; AVX512BW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm0, %zmm11
-; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [4,5,8,15,4,5,8,15]
-; AVX512BW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm6, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 (%rsp), %zmm2 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm7, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, (%rsp) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm9, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm12, %zmm0, %zmm2
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm8 # 64-byte Folded Reload
+; AVX512BW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermi2q %zmm24, %zmm29, %zmm27
+; AVX512BW-NEXT:    vmovdqa64 %zmm26, %zmm0
+; AVX512BW-NEXT:    vpermt2q %zmm18, %zmm31, %zmm0
+; AVX512BW-NEXT:    vpermi2q %zmm24, %zmm29, %zmm31
+; AVX512BW-NEXT:    vmovdqa64 %zmm26, %zmm8
+; AVX512BW-NEXT:    vpermt2q %zmm18, %zmm12, %zmm8
+; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm18
+; AVX512BW-NEXT:    vpermi2q %zmm24, %zmm29, %zmm12
+; AVX512BW-NEXT:    vpermt2q %zmm24, %zmm1, %zmm29
+; AVX512BW-NEXT:    vpermt2q %zmm14, %zmm1, %zmm26
+; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,9,0,5,6,9]
+; AVX512BW-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm8, %zmm1, %zmm11
+; AVX512BW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm21, %zmm1, %zmm25
+; AVX512BW-NEXT:    vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm24, %zmm1, %zmm9
+; AVX512BW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm14, %zmm1, %zmm28
+; AVX512BW-NEXT:    vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm25, %zmm1, %zmm22
+; AVX512BW-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm11, %zmm1, %zmm19
+; AVX512BW-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm19, %zmm1, %zmm3
+; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm30, %zmm1, %zmm23
+; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,10,0,5,6,10]
+; AVX512BW-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vpermt2q %zmm19, %zmm1, %zmm2
 ; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm21, %zmm1, %zmm28
+; AVX512BW-NEXT:    vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm26, %zmm0, %zmm2
+; AVX512BW-NEXT:    vpermt2q %zmm8, %zmm1, %zmm2
 ; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm0, %zmm30
-; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm0, %zmm24
-; AVX512BW-NEXT:    vpermt2q %zmm28, %zmm0, %zmm22
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm14, %zmm1, %zmm9
+; AVX512BW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm24, %zmm1, %zmm9
+; AVX512BW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm11, %zmm1, %zmm10
+; AVX512BW-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm25, %zmm1, %zmm4
+; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm30, %zmm1, %zmm27
+; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,11,0,5,6,11]
+; AVX512BW-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vpermt2q %zmm19, %zmm1, %zmm0
+; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm21, %zmm1, %zmm28
+; AVX512BW-NEXT:    vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm8, %zmm1, %zmm28
+; AVX512BW-NEXT:    vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm14, %zmm1, %zmm9
+; AVX512BW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm24, %zmm1, %zmm17
+; AVX512BW-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm11, %zmm1, %zmm13
+; AVX512BW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm25, %zmm1, %zmm20
+; AVX512BW-NEXT:    vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm30, %zmm1, %zmm31
+; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,12,0,5,6,12]
+; AVX512BW-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vpermt2q %zmm19, %zmm1, %zmm18
+; AVX512BW-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm21, %zmm1, %zmm28
+; AVX512BW-NEXT:    vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm8, %zmm1, %zmm28
+; AVX512BW-NEXT:    vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm14, %zmm1, %zmm28
+; AVX512BW-NEXT:    vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm24, %zmm1, %zmm16
+; AVX512BW-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm11, %zmm1, %zmm9
+; AVX512BW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm25, %zmm1, %zmm7
+; AVX512BW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm30, %zmm1, %zmm12
+; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [4,5,8,15,4,5,8,15]
+; AVX512BW-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm21, %zmm1, %zmm16
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm8, %zmm1, %zmm17
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm14, %zmm1, %zmm18
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm24, %zmm1, %zmm20
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm11, %zmm1, %zmm21
+; AVX512BW-NEXT:    vpermt2q %zmm25, %zmm1, %zmm5
+; AVX512BW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm30, %zmm1, %zmm29
+; AVX512BW-NEXT:    vpermt2q %zmm19, %zmm1, %zmm26
 ; AVX512BW-NEXT:    movb $24, %al
 ; AVX512BW-NEXT:    kmovd %eax, %k1
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} ymm16 = <0,7,14,u>
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm12, %zmm16, %zmm1
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[4,5,4,5],zmm31[4,5,4,5]
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm26 = [7,0,9,0,7,0,9,0]
-; AVX512BW-NEXT:    # zmm26 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm31, %zmm1
-; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm26, %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm1
-; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [4,11,4,11]
-; AVX512BW-NEXT:    # ymm0 = mem[0,1,0,1]
-; AVX512BW-NEXT:    vpermt2q %zmm31, %zmm0, %zmm1
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
+; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm22 {%k1} = zmm1[4,5,4,5],zmm15[4,5,4,5]
+; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm30 = [7,0,9,0,7,0,9,0]
+; AVX512BW-NEXT:    # zmm30 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vmovdqa64 %zmm15, %zmm0
+; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm30, %zmm0
+; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [4,11,4,11]
+; AVX512BW-NEXT:    # ymm2 = mem[0,1,0,1]
+; AVX512BW-NEXT:    vpermt2q %zmm15, %zmm2, %zmm1
 ; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm7, %zmm16, %zmm18
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm18 {%k1} = zmm1[4,5,4,5],zmm25[4,5,4,5]
-; AVX512BW-NEXT:    vmovdqa64 %zmm25, %zmm2
-; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm26, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm25, %zmm0, %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm31, %zmm16, %zmm27
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
+; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm19 {%k1} = zmm3[4,5,4,5],zmm1[4,5,4,5]
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm0
+; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm30, %zmm0
+; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm2, %zmm3
+; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqu64 (%rsp), %zmm3 # 64-byte Reload
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm27 {%k1} = zmm1[4,5,4,5],zmm23[4,5,4,5]
-; AVX512BW-NEXT:    vmovdqa64 %zmm23, %zmm2
-; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm26, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm23, %zmm0, %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm6, %zmm16, %zmm25
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm14 {%k1} = zmm3[4,5,4,5],zmm1[4,5,4,5]
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm0
+; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm30, %zmm0
+; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm2, %zmm3
+; AVX512BW-NEXT:    vmovdqu64 %zmm3, (%rsp) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm25 {%k1} = zmm1[4,5,4,5],zmm9[4,5,4,5]
-; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm2
-; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm26, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm9, %zmm0, %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm17, %zmm16, %zmm23
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
+; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm24 {%k1} = zmm3[4,5,4,5],zmm1[4,5,4,5]
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm0
+; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm30, %zmm0
+; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm2, %zmm3
+; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm11 {%k1} = zmm0[4,5,4,5],zmm1[4,5,4,5]
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm3
+; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm30, %zmm3
+; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
+; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm10 {%k1} = zmm0[4,5,4,5],zmm6[4,5,4,5]
+; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm3
+; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm30, %zmm3
+; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm6, %zmm2, %zmm0
+; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm9 {%k1} = zmm15[4,5,4,5],zmm6[4,5,4,5]
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm23 {%k1} = zmm1[4,5,4,5],zmm3[4,5,4,5]
-; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm2
-; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm26, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm0, %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm28
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm14, %zmm16, %zmm13
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm13 {%k1} = zmm1[4,5,4,5],zmm3[4,5,4,5]
-; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm2
-; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm26, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm0, %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm10
+; AVX512BW-NEXT:    vpermt2q %zmm13, %zmm30, %zmm28
+; AVX512BW-NEXT:    vpermi2q %zmm15, %zmm6, %zmm30
+; AVX512BW-NEXT:    vpermt2q %zmm6, %zmm2, %zmm15
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
+; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm25 {%k1} = zmm13[4,5,4,5],zmm3[4,5,4,5]
+; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm2, %zmm13
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm0 = [6,13]
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm16, %zmm10
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
-; AVX512BW-NEXT:    vpermi2q %zmm28, %zmm15, %zmm16
+; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm16 {%k1} = zmm1[4,5,4,5],zmm2[4,5,4,5]
+; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm21
+; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm5, %zmm26, %zmm21
-; AVX512BW-NEXT:    vpermi2q %zmm1, %zmm2, %zmm26
-; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm0, %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm1
-; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm10 {%k1} = zmm5[4,5,4,5],zmm4[4,5,4,5]
-; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm0, %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm0 = <9,0,7,u>
-; AVX512BW-NEXT:    vmovdqa64 %zmm12, %zmm4
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm0, %zmm12
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm1 = [6,13]
-; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm1, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm9
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm0, %zmm9
-; AVX512BW-NEXT:    vpermt2q %zmm7, %zmm1, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm31, %zmm7
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm0, %zmm7
-; AVX512BW-NEXT:    vpermt2q %zmm31, %zmm1, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm4
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm0, %zmm6
-; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm1, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm17, %zmm5
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm0, %zmm5
-; AVX512BW-NEXT:    vpermt2q %zmm17, %zmm1, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm14, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm0, %zmm2
-; AVX512BW-NEXT:    vpermt2q %zmm14, %zmm1, %zmm4
-; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm14
-; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm4
-; AVX512BW-NEXT:    vpermt2q %zmm8, %zmm0, %zmm4
-; AVX512BW-NEXT:    vpermi2q %zmm15, %zmm28, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm28, %zmm1, %zmm15
-; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm1, %zmm8
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX512BW-NEXT:    # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-NEXT:    vinsertf64x4 $0, %ymm1, %zmm3, %zmm1
-; AVX512BW-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX512BW-NEXT:    # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-NEXT:    vmovups (%rsp), %zmm3 # 64-byte Reload
-; AVX512BW-NEXT:    vinsertf64x4 $0, %ymm1, %zmm3, %zmm1
-; AVX512BW-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX512BW-NEXT:    # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-NEXT:    vinsertf64x4 $0, %ymm1, %zmm3, %zmm1
-; AVX512BW-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX512BW-NEXT:    # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-NEXT:    vinsertf64x4 $0, %ymm1, %zmm3, %zmm1
-; AVX512BW-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX512BW-NEXT:    # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-NEXT:    vinsertf64x4 $0, %ymm1, %zmm3, %zmm1
-; AVX512BW-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm1 # 32-byte Folded Reload
-; AVX512BW-NEXT:    # ymm1 = ymm14[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm30, %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm1 # 32-byte Folded Reload
-; AVX512BW-NEXT:    # ymm1 = ymm15[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm24, %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, (%rsp) # 64-byte Spill
-; AVX512BW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm1 # 32-byte Folded Reload
-; AVX512BW-NEXT:    # ymm1 = ymm8[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm22, %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm6 # 64-byte Folded Reload
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm8 # 64-byte Folded Reload
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm7 # 64-byte Folded Reload
+; AVX512BW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload
+; AVX512BW-NEXT:    # ymm0 = ymm2[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm16, %zmm0
+; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload
+; AVX512BW-NEXT:    # ymm0 = ymm3[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm17, %zmm17
+; AVX512BW-NEXT:    vpblendd $240, (%rsp), %ymm1, %ymm0 # 32-byte Folded Reload
+; AVX512BW-NEXT:    # ymm0 = ymm1[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm18, %zmm0
+; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload
+; AVX512BW-NEXT:    # ymm0 = ymm4[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm20, %zmm0
+; AVX512BW-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512BW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload
+; AVX512BW-NEXT:    # ymm0 = ymm5[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm21, %zmm21
+; AVX512BW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload
+; AVX512BW-NEXT:    # ymm0 = ymm6[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm15[4,5,6,7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm29, %zmm29
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm13[4,5,6,7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm26, %zmm0
+; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    movb $-32, %al
 ; AVX512BW-NEXT:    kmovd %eax, %k2
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm19 {%k2}
+; AVX512BW-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm22, %zmm19
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm19 {%k2}
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm24 {%k2}
+; AVX512BW-NEXT:    vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm14, %zmm24
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm24 {%k2}
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm10 {%k2}
+; AVX512BW-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm11 {%k2}
+; AVX512BW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm25 {%k2}
+; AVX512BW-NEXT:    vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm23, %zmm9 {%k2}
+; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm25
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm16 {%k1}
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm16 {%k2}
+; AVX512BW-NEXT:    vmovdqa64 %zmm16, %zmm22
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm18 {%k2}
-; AVX512BW-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm1 {%k2}
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k2}
 ; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm25 {%k2}
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm27 {%k2}
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm13 {%k2}
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k2}
+; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm16 {%k1}
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm16 {%k2}
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm11 {%k1}
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm11 {%k2}
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm23 {%k2}
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm10 {%k1}
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
 ; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm10 {%k2}
-; AVX512BW-NEXT:    vmovdqa64 %zmm19, %zmm16 {%k2}
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k2}
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm12 {%k1}
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm12 {%k2}
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm9 {%k1}
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm9 {%k2}
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm7 {%k1}
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm7 {%k2}
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm6 {%k1}
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm6 {%k2}
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm5 {%k1}
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm5 {%k2}
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
 ; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
 ; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k2}
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
 ; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 %zmm20, %zmm0 {%k2}
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm29, %zmm31 {%k2}
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm28 {%k2}
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm29 {%k2}
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm30 {%k2}
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm15 {%k2}
+; AVX512BW-NEXT:    vmovdqa64 %zmm27, %zmm0 {%k2}
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm9
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm13 {%k2}
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm14 {%k2}
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm8 {%k2}
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm3 {%k2}
-; AVX512BW-NEXT:    vmovdqa 2752(%rdi), %ymm1
-; AVX512BW-NEXT:    vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
-; AVX512BW-NEXT:    vextracti128 $1, %ymm1, %xmm1
-; AVX512BW-NEXT:    vinserti32x4 $0, %xmm1, %zmm21, %zmm17
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm17 {%k2}
-; AVX512BW-NEXT:    vmovdqa 512(%rdi), %ymm1
-; AVX512BW-NEXT:    vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
-; AVX512BW-NEXT:    vextracti128 $1, %ymm1, %xmm1
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
-; AVX512BW-NEXT:    vinserti32x4 $0, %xmm1, %zmm18, %zmm1
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm14 {%k2}
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm15 {%k2}
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm18, %zmm1 {%k2}
-; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %ymm18
-; AVX512BW-NEXT:    vpalignr {{.*#+}} ymm18 = mem[8,9,10,11,12,13,14,15],ymm18[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm18[16,17,18,19,20,21,22,23]
-; AVX512BW-NEXT:    vextracti32x4 $1, %ymm18, %xmm18
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
-; AVX512BW-NEXT:    vinserti32x4 $0, %xmm18, %zmm19, %zmm18
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm19, %zmm18 {%k2}
-; AVX512BW-NEXT:    vmovdqa64 1408(%rdi), %ymm19
-; AVX512BW-NEXT:    vpalignr {{.*#+}} ymm19 = mem[8,9,10,11,12,13,14,15],ymm19[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm19[16,17,18,19,20,21,22,23]
-; AVX512BW-NEXT:    vextracti32x4 $1, %ymm19, %xmm19
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
-; AVX512BW-NEXT:    vinserti32x4 $0, %xmm19, %zmm20, %zmm19
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm18 {%k2}
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm20, %zmm19 {%k2}
-; AVX512BW-NEXT:    vmovdqa64 960(%rdi), %ymm20
-; AVX512BW-NEXT:    vpalignr {{.*#+}} ymm20 = mem[8,9,10,11,12,13,14,15],ymm20[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm20[16,17,18,19,20,21,22,23]
-; AVX512BW-NEXT:    vextracti32x4 $1, %ymm20, %xmm20
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
-; AVX512BW-NEXT:    vinserti32x4 $0, %xmm20, %zmm21, %zmm20
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm21, %zmm20 {%k2}
-; AVX512BW-NEXT:    vmovdqa64 2304(%rdi), %ymm22
-; AVX512BW-NEXT:    vpalignr {{.*#+}} ymm22 = mem[8,9,10,11,12,13,14,15],ymm22[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm22[16,17,18,19,20,21,22,23]
-; AVX512BW-NEXT:    vextracti32x4 $1, %ymm22, %xmm22
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
-; AVX512BW-NEXT:    vinserti32x4 $0, %xmm22, %zmm21, %zmm22
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm21, %zmm22 {%k2}
-; AVX512BW-NEXT:    vmovdqa64 1856(%rdi), %ymm24
-; AVX512BW-NEXT:    vpalignr {{.*#+}} ymm24 = mem[8,9,10,11,12,13,14,15],ymm24[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm24[16,17,18,19,20,21,22,23]
-; AVX512BW-NEXT:    vextracti32x4 $1, %ymm24, %xmm24
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
-; AVX512BW-NEXT:    vinserti32x4 $0, %xmm24, %zmm21, %zmm21
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm24, %zmm21 {%k2}
-; AVX512BW-NEXT:    vmovdqa64 3200(%rdi), %ymm24
-; AVX512BW-NEXT:    vpalignr {{.*#+}} ymm24 = mem[8,9,10,11,12,13,14,15],ymm24[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm24[16,17,18,19,20,21,22,23]
-; AVX512BW-NEXT:    vextracti32x4 $1, %ymm24, %xmm24
-; AVX512BW-NEXT:    vinserti32x4 $0, %xmm24, %zmm26, %zmm24
-; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm24 {%k2}
-; AVX512BW-NEXT:    vmovdqa64 %zmm16, 448(%rsi)
-; AVX512BW-NEXT:    vmovdqa64 %zmm10, 384(%rsi)
-; AVX512BW-NEXT:    vmovdqa64 %zmm23, 320(%rsi)
-; AVX512BW-NEXT:    vmovdqa64 %zmm13, 256(%rsi)
-; AVX512BW-NEXT:    vmovdqa64 %zmm27, 192(%rsi)
-; AVX512BW-NEXT:    vmovdqa64 %zmm25, 128(%rsi)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm10, 64(%rsi)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm10, (%rsi)
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, 448(%rdx)
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm20 {%k2}
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm23 {%k2}
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm26 {%k2}
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm31, %zmm27 {%k2}
+; AVX512BW-NEXT:    vmovdqa 2752(%rdi), %ymm0
+; AVX512BW-NEXT:    vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
+; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX512BW-NEXT:    vinserti32x4 $0, %xmm0, %zmm28, %zmm1
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k2}
+; AVX512BW-NEXT:    vmovdqa 512(%rdi), %ymm0
+; AVX512BW-NEXT:    vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
+; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-NEXT:    vinserti32x4 $0, %xmm0, %zmm3, %zmm0
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm0 {%k2}
+; AVX512BW-NEXT:    vmovdqa 64(%rdi), %ymm3
+; AVX512BW-NEXT:    vpalignr {{.*#+}} ymm3 = mem[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23]
+; AVX512BW-NEXT:    vextracti128 $1, %ymm3, %xmm3
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-NEXT:    vinserti32x4 $0, %xmm3, %zmm4, %zmm3
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm3 {%k2}
+; AVX512BW-NEXT:    vmovdqa 1408(%rdi), %ymm4
+; AVX512BW-NEXT:    vpalignr {{.*#+}} ymm4 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23]
+; AVX512BW-NEXT:    vextracti128 $1, %ymm4, %xmm4
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-NEXT:    vinserti32x4 $0, %xmm4, %zmm5, %zmm4
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm4 {%k2}
+; AVX512BW-NEXT:    vmovdqa 960(%rdi), %ymm5
+; AVX512BW-NEXT:    vpalignr {{.*#+}} ymm5 = mem[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23]
+; AVX512BW-NEXT:    vextracti128 $1, %ymm5, %xmm5
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512BW-NEXT:    vinserti32x4 $0, %xmm5, %zmm6, %zmm5
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm5 {%k2}
+; AVX512BW-NEXT:    vmovdqa 2304(%rdi), %ymm6
+; AVX512BW-NEXT:    vpalignr {{.*#+}} ymm6 = mem[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23]
+; AVX512BW-NEXT:    vextracti128 $1, %ymm6, %xmm6
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512BW-NEXT:    vinserti32x4 $0, %xmm6, %zmm7, %zmm6
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm6 {%k2}
+; AVX512BW-NEXT:    vmovdqa 1856(%rdi), %ymm7
+; AVX512BW-NEXT:    vpalignr {{.*#+}} ymm7 = mem[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23]
+; AVX512BW-NEXT:    vextracti128 $1, %ymm7, %xmm7
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-NEXT:    vinserti32x4 $0, %xmm7, %zmm8, %zmm7
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm7 {%k2}
+; AVX512BW-NEXT:    vmovdqa 3200(%rdi), %ymm8
+; AVX512BW-NEXT:    vpalignr {{.*#+}} ymm8 = mem[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23]
+; AVX512BW-NEXT:    vextracti128 $1, %ymm8, %xmm8
+; AVX512BW-NEXT:    vinserti32x4 $0, %xmm8, %zmm30, %zmm8
+; AVX512BW-NEXT:    vmovdqa64 %zmm12, %zmm8 {%k2}
+; AVX512BW-NEXT:    vmovdqa64 %zmm25, 448(%rsi)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm12, 384(%rsi)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm12, 320(%rsi)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm12, 256(%rsi)
+; AVX512BW-NEXT:    vmovdqa64 %zmm24, 192(%rsi)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm12, 128(%rsi)
+; AVX512BW-NEXT:    vmovdqa64 %zmm19, 64(%rsi)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm12, (%rsi)
+; AVX512BW-NEXT:    vmovdqa64 %zmm9, 448(%rdx)
 ; AVX512BW-NEXT:    vmovdqa64 %zmm2, 256(%rdx)
-; AVX512BW-NEXT:    vmovdqa64 %zmm5, 320(%rdx)
-; AVX512BW-NEXT:    vmovdqa64 %zmm6, 128(%rdx)
-; AVX512BW-NEXT:    vmovdqa64 %zmm7, 192(%rdx)
-; AVX512BW-NEXT:    vmovdqa64 %zmm9, (%rdx)
-; AVX512BW-NEXT:    vmovdqa64 %zmm12, 64(%rdx)
-; AVX512BW-NEXT:    vmovdqa64 %zmm4, 384(%rdx)
-; AVX512BW-NEXT:    vmovdqa64 %zmm3, 448(%rcx)
-; AVX512BW-NEXT:    vmovdqa64 %zmm8, 256(%rcx)
-; AVX512BW-NEXT:    vmovdqa64 %zmm14, 320(%rcx)
-; AVX512BW-NEXT:    vmovdqa64 %zmm15, 128(%rcx)
-; AVX512BW-NEXT:    vmovdqa64 %zmm30, 192(%rcx)
-; AVX512BW-NEXT:    vmovdqa64 %zmm29, (%rcx)
-; AVX512BW-NEXT:    vmovdqa64 %zmm28, 64(%rcx)
-; AVX512BW-NEXT:    vmovdqa64 %zmm31, 384(%rcx)
-; AVX512BW-NEXT:    vmovdqa64 %zmm24, 448(%r8)
-; AVX512BW-NEXT:    vmovdqa64 %zmm21, 256(%r8)
-; AVX512BW-NEXT:    vmovdqa64 %zmm22, 320(%r8)
-; AVX512BW-NEXT:    vmovdqa64 %zmm20, 128(%r8)
-; AVX512BW-NEXT:    vmovdqa64 %zmm19, 192(%r8)
-; AVX512BW-NEXT:    vmovdqa64 %zmm18, (%r8)
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, 64(%r8)
-; AVX512BW-NEXT:    vmovdqa64 %zmm17, 384(%r8)
+; AVX512BW-NEXT:    vmovdqa64 %zmm10, 320(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm11, 128(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm16, 192(%rdx)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm2, (%rdx)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm2, 64(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm22, 384(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm27, 448(%rcx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm26, 256(%rcx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm23, 320(%rcx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm20, 128(%rcx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm18, 192(%rcx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm15, (%rcx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm14, 64(%rcx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm13, 384(%rcx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm8, 448(%r8)
+; AVX512BW-NEXT:    vmovdqa64 %zmm7, 256(%r8)
+; AVX512BW-NEXT:    vmovdqa64 %zmm6, 320(%r8)
+; AVX512BW-NEXT:    vmovdqa64 %zmm5, 128(%r8)
+; AVX512BW-NEXT:    vmovdqa64 %zmm4, 192(%r8)
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, (%r8)
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, 64(%r8)
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, 384(%r8)
 ; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-NEXT:    vmovaps %zmm0, 448(%r9)
 ; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
@@ -9931,21 +10051,18 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-NEXT:    vmovaps %zmm0, 384(%rax)
-; AVX512BW-NEXT:    vmovups (%rsp), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 448(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm29, 448(%rax)
 ; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-NEXT:    vmovaps %zmm0, 256(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 320(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm21, 320(%rax)
+; AVX512BW-NEXT:    vmovups (%rsp), %zmm0 # 64-byte Reload
 ; AVX512BW-NEXT:    vmovaps %zmm0, 128(%rax)
 ; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-NEXT:    vmovaps %zmm0, 192(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, (%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm17, (%rax)
 ; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-NEXT:    vmovaps %zmm0, 64(%rax)
-; AVX512BW-NEXT:    addq $6664, %rsp # imm = 0x1A08
+; AVX512BW-NEXT:    addq $7624, %rsp # imm = 0x1DC8
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
   %wide.vec = load <448 x i64>, ptr %in.vec, align 64

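(A note for readers less used to these autogenerated checks: the `{{[-0-9]+}}(%r{{[sb]}}p)` spill/reload patterns above are FileCheck regexes, so a single check line tolerates any stack offset and either an %rsp- or %rbp-based frame. For example, a check line such as

  ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload

matches output like

  vmovdqu64 -192(%rsp), %zmm1 # 64-byte Reload

which is why the interesting part of these hunks is the change in registers and spill slots, not the offsets themselves.)
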
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-8.ll
index 4738b2344255b5..638f7f685319bd 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-8.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-8.ll
@@ -815,229 +815,225 @@ define void @load_i64_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ;
 ; AVX512F-LABEL: load_i64_stride8_vf8:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa64 192(%rdi), %zmm18
-; AVX512F-NEXT:    vmovdqa64 128(%rdi), %zmm1
-; AVX512F-NEXT:    vmovdqa64 64(%rdi), %zmm4
-; AVX512F-NEXT:    vmovdqa64 (%rdi), %zmm2
-; AVX512F-NEXT:    vmovdqa64 320(%rdi), %zmm12
-; AVX512F-NEXT:    vmovdqa64 256(%rdi), %zmm3
-; AVX512F-NEXT:    vmovdqa64 448(%rdi), %zmm7
-; AVX512F-NEXT:    vmovdqa64 384(%rdi), %zmm11
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm5 = [0,8,0,8,0,8,0,8]
-; AVX512F-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm11, %zmm6
-; AVX512F-NEXT:    vpermt2q %zmm7, %zmm5, %zmm6
-; AVX512F-NEXT:    vpermi2q %zmm12, %zmm3, %zmm5
-; AVX512F-NEXT:    movb $-64, %al
-; AVX512F-NEXT:    kmovw %eax, %k1
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm8 = [1,9,1,9,1,9,1,9]
+; AVX512F-NEXT:    pushq %rbx
+; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %r11
+; AVX512F-NEXT:    vmovdqa64 64(%rdi), %zmm2
+; AVX512F-NEXT:    vmovdqa64 (%rdi), %zmm1
+; AVX512F-NEXT:    vmovdqa64 192(%rdi), %zmm7
+; AVX512F-NEXT:    vmovdqa64 128(%rdi), %zmm6
+; AVX512F-NEXT:    vmovdqa64 320(%rdi), %zmm3
+; AVX512F-NEXT:    vmovdqa64 256(%rdi), %zmm0
+; AVX512F-NEXT:    vmovdqa64 448(%rdi), %zmm4
+; AVX512F-NEXT:    vmovdqa64 384(%rdi), %zmm5
+; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm8 = [0,8,0,8,0,8,0,8]
 ; AVX512F-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm11, %zmm10
-; AVX512F-NEXT:    vpermt2q %zmm7, %zmm8, %zmm10
-; AVX512F-NEXT:    vpermi2q %zmm12, %zmm3, %zmm8
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm9 = [2,10,2,10,2,10,2,10]
-; AVX512F-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm11, %zmm13
-; AVX512F-NEXT:    vpermt2q %zmm7, %zmm9, %zmm13
-; AVX512F-NEXT:    vpermi2q %zmm12, %zmm3, %zmm9
-; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm9 {%k1}
-; AVX512F-NEXT:    vmovdqa 192(%rdi), %ymm13
-; AVX512F-NEXT:    vmovdqa 128(%rdi), %ymm14
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm15 = ymm14[0],ymm13[0],ymm14[2],ymm13[2]
-; AVX512F-NEXT:    vmovdqa64 64(%rdi), %ymm16
-; AVX512F-NEXT:    vmovdqa64 (%rdi), %ymm17
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm17[0],ymm16[0],ymm17[2],ymm16[2]
-; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm15[2,3]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm9, %zmm9
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11]
-; AVX512F-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm11, %zmm15
-; AVX512F-NEXT:    vpermt2q %zmm7, %zmm0, %zmm15
-; AVX512F-NEXT:    vpermi2q %zmm12, %zmm3, %zmm0
-; AVX512F-NEXT:    vmovdqa64 %zmm15, %zmm0 {%k1}
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm13 = ymm14[1],ymm13[1],ymm14[3],ymm13[3]
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm14 = ymm17[1],ymm16[1],ymm17[3],ymm16[3]
-; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm13 = ymm14[2,3],ymm13[2,3]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm13, %zmm0, %zmm13
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm14 = [4,12,4,12,4,12,4,12]
-; AVX512F-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm0 = zmm3[0],zmm12[0],zmm3[2],zmm12[2],zmm3[4],zmm12[4],zmm3[6],zmm12[6]
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm16 = zmm3[1],zmm12[1],zmm3[3],zmm12[3],zmm3[5],zmm12[5],zmm3[7],zmm12[7]
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm15 = [6,14,6,14,6,14,6,14]
-; AVX512F-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm17
-; AVX512F-NEXT:    vpermt2q %zmm12, %zmm15, %zmm17
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm19 = [7,15,7,15,7,15,7,15]
-; AVX512F-NEXT:    # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vpermt2q %zmm12, %zmm19, %zmm3
-; AVX512F-NEXT:    vmovdqa64 %zmm11, %zmm12
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm17 {%k1} = zmm11[0],zmm7[0],zmm11[2],zmm7[2],zmm11[4],zmm7[4],zmm11[6],zmm7[6]
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm11[1],zmm7[1],zmm11[3],zmm7[3],zmm11[5],zmm7[5],zmm11[7],zmm7[7]
-; AVX512F-NEXT:    vpermt2q %zmm7, %zmm14, %zmm11
-; AVX512F-NEXT:    vmovdqa64 %zmm11, %zmm0 {%k1}
-; AVX512F-NEXT:    vpermi2q %zmm4, %zmm2, %zmm14
-; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm11 = [4,12,4,12]
-; AVX512F-NEXT:    # ymm11 = mem[0,1,0,1]
-; AVX512F-NEXT:    vpermi2q %zmm18, %zmm1, %zmm11
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm11 = ymm14[0,1,2,3],ymm11[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm11, %zmm0, %zmm0
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm11 = [5,13,5,13,5,13,5,13]
+; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm9
+; AVX512F-NEXT:    vpermt2q %zmm4, %zmm8, %zmm9
+; AVX512F-NEXT:    vpermi2q %zmm3, %zmm0, %zmm8
+; AVX512F-NEXT:    movb $-64, %bl
+; AVX512F-NEXT:    kmovw %ebx, %k1
+; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm8 {%k1}
+; AVX512F-NEXT:    vmovdqa (%rdi), %xmm9
+; AVX512F-NEXT:    vmovdqa 64(%rdi), %xmm10
+; AVX512F-NEXT:    vinserti128 $1, 192(%rdi), %ymm10, %ymm10
+; AVX512F-NEXT:    vinserti128 $1, 128(%rdi), %ymm9, %ymm9
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm11 = ymm9[0],ymm10[0],ymm9[2],ymm10[2]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm11, %zmm8, %zmm16
+; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm11 = [1,9,1,9,1,9,1,9]
 ; AVX512F-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm12
+; AVX512F-NEXT:    vpermt2q %zmm4, %zmm11, %zmm12
+; AVX512F-NEXT:    vpermi2q %zmm3, %zmm0, %zmm11
+; AVX512F-NEXT:    vmovdqa64 %zmm12, %zmm11 {%k1}
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm9 = ymm9[1],ymm10[1],ymm9[3],ymm10[3]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm9, %zmm11, %zmm9
+; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm10 = [2,10,2,10,2,10,2,10]
+; AVX512F-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm11
+; AVX512F-NEXT:    vpermt2q %zmm4, %zmm10, %zmm11
+; AVX512F-NEXT:    vpermi2q %zmm3, %zmm0, %zmm10
+; AVX512F-NEXT:    vmovdqa64 %zmm11, %zmm10 {%k1}
+; AVX512F-NEXT:    vmovdqa 192(%rdi), %ymm11
+; AVX512F-NEXT:    vmovdqa 128(%rdi), %ymm12
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm11[0],ymm12[2],ymm11[2]
+; AVX512F-NEXT:    vmovdqa 64(%rdi), %ymm14
+; AVX512F-NEXT:    vmovdqa (%rdi), %ymm15
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm8 = ymm15[0],ymm14[0],ymm15[2],ymm14[2]
+; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm8 = ymm8[2,3],ymm13[2,3]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm8, %zmm10, %zmm10
+; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm8 = [3,11,3,11,3,11,3,11]
+; AVX512F-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm13
+; AVX512F-NEXT:    vpermt2q %zmm4, %zmm8, %zmm13
+; AVX512F-NEXT:    vpermi2q %zmm3, %zmm0, %zmm8
+; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm8 {%k1}
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm11 = ymm12[1],ymm11[1],ymm12[3],ymm11[3]
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm12 = ymm15[1],ymm14[1],ymm15[3],ymm14[3]
+; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm11 = ymm12[2,3],ymm11[2,3]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm11, %zmm8, %zmm8
+; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm11 = [4,12,4,12,4,12,4,12]
+; AVX512F-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm12
 ; AVX512F-NEXT:    vpermt2q %zmm7, %zmm11, %zmm12
-; AVX512F-NEXT:    vpermi2q %zmm4, %zmm2, %zmm11
-; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm7 = [5,13,5,13]
-; AVX512F-NEXT:    # ymm7 = mem[0,1,0,1]
-; AVX512F-NEXT:    vpermi2q %zmm18, %zmm1, %zmm7
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm7 = ymm11[0,1,2,3],ymm7[4,5,6,7]
-; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm5 {%k1}
-; AVX512F-NEXT:    vmovdqa (%rdi), %xmm6
-; AVX512F-NEXT:    vmovdqa 64(%rdi), %xmm11
-; AVX512F-NEXT:    vinserti128 $1, 192(%rdi), %ymm11, %ymm11
-; AVX512F-NEXT:    vinserti128 $1, 128(%rdi), %ymm6, %ymm6
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm14 = ymm6[0],ymm11[0],ymm6[2],ymm11[2]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm14, %zmm5, %zmm5
-; AVX512F-NEXT:    vmovdqa64 %zmm10, %zmm8 {%k1}
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],ymm11[1],ymm6[3],ymm11[3]
-; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %rdi
-; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm6, %zmm8, %zmm6
-; AVX512F-NEXT:    vmovdqa64 %zmm12, %zmm16 {%k1}
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm7, %zmm16, %zmm7
-; AVX512F-NEXT:    vpermi2q %zmm4, %zmm2, %zmm15
-; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm8 = [6,14,6,14]
-; AVX512F-NEXT:    # ymm8 = mem[0,1,0,1]
-; AVX512F-NEXT:    vpermi2q %zmm18, %zmm1, %zmm8
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm8 = ymm15[0,1,2,3],ymm8[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm8, %zmm17, %zmm8
-; AVX512F-NEXT:    vpermt2q %zmm4, %zmm19, %zmm2
-; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [7,15,7,15]
-; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
-; AVX512F-NEXT:    vpermi2q %zmm18, %zmm1, %zmm4
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm4[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm3, %zmm1
-; AVX512F-NEXT:    vmovdqa64 %zmm5, (%rsi)
-; AVX512F-NEXT:    vmovdqa64 %zmm6, (%rdx)
-; AVX512F-NEXT:    vmovdqa64 %zmm9, (%rcx)
-; AVX512F-NEXT:    vmovdqa64 %zmm13, (%r8)
-; AVX512F-NEXT:    vmovdqa64 %zmm0, (%r9)
-; AVX512F-NEXT:    vmovdqa64 %zmm7, (%r10)
-; AVX512F-NEXT:    vmovdqa64 %zmm8, (%rdi)
-; AVX512F-NEXT:    vmovdqa64 %zmm1, (%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm13
+; AVX512F-NEXT:    vpermt2q %zmm2, %zmm11, %zmm13
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7]
+; AVX512F-NEXT:    vpermi2q %zmm4, %zmm5, %zmm11
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm13 = zmm0[0],zmm3[0],zmm0[2],zmm3[2],zmm0[4],zmm3[4],zmm0[6],zmm3[6]
+; AVX512F-NEXT:    vmovdqa64 %zmm11, %zmm13 {%k1}
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm12, %zmm13, %zmm11
+; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,5,13,5,13,5,13]
+; AVX512F-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm13
+; AVX512F-NEXT:    vpermt2q %zmm7, %zmm12, %zmm13
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm14
+; AVX512F-NEXT:    vpermt2q %zmm2, %zmm12, %zmm14
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7]
+; AVX512F-NEXT:    vpermi2q %zmm4, %zmm5, %zmm12
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm14 = zmm0[1],zmm3[1],zmm0[3],zmm3[3],zmm0[5],zmm3[5],zmm0[7],zmm3[7]
+; AVX512F-NEXT:    vmovdqa64 %zmm12, %zmm14 {%k1}
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm13, %zmm14, %zmm12
+; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,6,14,6,14,6,14]
+; AVX512F-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm14
+; AVX512F-NEXT:    vpermt2q %zmm7, %zmm13, %zmm14
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm15
+; AVX512F-NEXT:    vpermt2q %zmm2, %zmm13, %zmm15
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3],ymm14[4,5,6,7]
+; AVX512F-NEXT:    vpermi2q %zmm3, %zmm0, %zmm13
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm13 {%k1} = zmm5[0],zmm4[0],zmm5[2],zmm4[2],zmm5[4],zmm4[4],zmm5[6],zmm4[6]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm14, %zmm13, %zmm13
+; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm14 = [7,15,7,15,7,15,7,15]
+; AVX512F-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vpermt2q %zmm7, %zmm14, %zmm6
+; AVX512F-NEXT:    vpermt2q %zmm2, %zmm14, %zmm1
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7]
+; AVX512F-NEXT:    vpermt2q %zmm3, %zmm14, %zmm0
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm5[1],zmm4[1],zmm5[3],zmm4[3],zmm5[5],zmm4[5],zmm5[7],zmm4[7]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm0
+; AVX512F-NEXT:    vmovdqa64 %zmm16, (%rsi)
+; AVX512F-NEXT:    vmovdqa64 %zmm9, (%rdx)
+; AVX512F-NEXT:    vmovdqa64 %zmm10, (%rcx)
+; AVX512F-NEXT:    vmovdqa64 %zmm8, (%r8)
+; AVX512F-NEXT:    vmovdqa64 %zmm11, (%r9)
+; AVX512F-NEXT:    vmovdqa64 %zmm12, (%r11)
+; AVX512F-NEXT:    vmovdqa64 %zmm13, (%r10)
+; AVX512F-NEXT:    vmovdqa64 %zmm0, (%rax)
+; AVX512F-NEXT:    popq %rbx
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512BW-LABEL: load_i64_stride8_vf8:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 192(%rdi), %zmm18
-; AVX512BW-NEXT:    vmovdqa64 128(%rdi), %zmm1
-; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %zmm4
-; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm2
-; AVX512BW-NEXT:    vmovdqa64 320(%rdi), %zmm12
-; AVX512BW-NEXT:    vmovdqa64 256(%rdi), %zmm3
-; AVX512BW-NEXT:    vmovdqa64 448(%rdi), %zmm7
-; AVX512BW-NEXT:    vmovdqa64 384(%rdi), %zmm11
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm5 = [0,8,0,8,0,8,0,8]
-; AVX512BW-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm6
-; AVX512BW-NEXT:    vpermt2q %zmm7, %zmm5, %zmm6
-; AVX512BW-NEXT:    vpermi2q %zmm12, %zmm3, %zmm5
-; AVX512BW-NEXT:    movb $-64, %al
-; AVX512BW-NEXT:    kmovd %eax, %k1
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm8 = [1,9,1,9,1,9,1,9]
+; AVX512BW-NEXT:    pushq %rbx
+; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %r11
+; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %zmm2
+; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm1
+; AVX512BW-NEXT:    vmovdqa64 192(%rdi), %zmm7
+; AVX512BW-NEXT:    vmovdqa64 128(%rdi), %zmm6
+; AVX512BW-NEXT:    vmovdqa64 320(%rdi), %zmm3
+; AVX512BW-NEXT:    vmovdqa64 256(%rdi), %zmm0
+; AVX512BW-NEXT:    vmovdqa64 448(%rdi), %zmm4
+; AVX512BW-NEXT:    vmovdqa64 384(%rdi), %zmm5
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm8 = [0,8,0,8,0,8,0,8]
 ; AVX512BW-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm10
-; AVX512BW-NEXT:    vpermt2q %zmm7, %zmm8, %zmm10
-; AVX512BW-NEXT:    vpermi2q %zmm12, %zmm3, %zmm8
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm9 = [2,10,2,10,2,10,2,10]
-; AVX512BW-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm13
-; AVX512BW-NEXT:    vpermt2q %zmm7, %zmm9, %zmm13
-; AVX512BW-NEXT:    vpermi2q %zmm12, %zmm3, %zmm9
-; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm9 {%k1}
-; AVX512BW-NEXT:    vmovdqa 192(%rdi), %ymm13
-; AVX512BW-NEXT:    vmovdqa 128(%rdi), %ymm14
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm15 = ymm14[0],ymm13[0],ymm14[2],ymm13[2]
-; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %ymm16
-; AVX512BW-NEXT:    vmovdqa64 (%rdi), %ymm17
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm17[0],ymm16[0],ymm17[2],ymm16[2]
-; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm15[2,3]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm9, %zmm9
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11]
-; AVX512BW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm15
-; AVX512BW-NEXT:    vpermt2q %zmm7, %zmm0, %zmm15
-; AVX512BW-NEXT:    vpermi2q %zmm12, %zmm3, %zmm0
-; AVX512BW-NEXT:    vmovdqa64 %zmm15, %zmm0 {%k1}
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm13 = ymm14[1],ymm13[1],ymm14[3],ymm13[3]
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm14 = ymm17[1],ymm16[1],ymm17[3],ymm16[3]
-; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm13 = ymm14[2,3],ymm13[2,3]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm13, %zmm0, %zmm13
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm14 = [4,12,4,12,4,12,4,12]
-; AVX512BW-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm0 = zmm3[0],zmm12[0],zmm3[2],zmm12[2],zmm3[4],zmm12[4],zmm3[6],zmm12[6]
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm16 = zmm3[1],zmm12[1],zmm3[3],zmm12[3],zmm3[5],zmm12[5],zmm3[7],zmm12[7]
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm15 = [6,14,6,14,6,14,6,14]
-; AVX512BW-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm17
-; AVX512BW-NEXT:    vpermt2q %zmm12, %zmm15, %zmm17
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm19 = [7,15,7,15,7,15,7,15]
-; AVX512BW-NEXT:    # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vpermt2q %zmm12, %zmm19, %zmm3
-; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm12
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm17 {%k1} = zmm11[0],zmm7[0],zmm11[2],zmm7[2],zmm11[4],zmm7[4],zmm11[6],zmm7[6]
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm11[1],zmm7[1],zmm11[3],zmm7[3],zmm11[5],zmm7[5],zmm11[7],zmm7[7]
-; AVX512BW-NEXT:    vpermt2q %zmm7, %zmm14, %zmm11
-; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm0 {%k1}
-; AVX512BW-NEXT:    vpermi2q %zmm4, %zmm2, %zmm14
-; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm11 = [4,12,4,12]
-; AVX512BW-NEXT:    # ymm11 = mem[0,1,0,1]
-; AVX512BW-NEXT:    vpermi2q %zmm18, %zmm1, %zmm11
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm11 = ymm14[0,1,2,3],ymm11[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm11, %zmm0, %zmm0
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm11 = [5,13,5,13,5,13,5,13]
+; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm9
+; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm8, %zmm9
+; AVX512BW-NEXT:    vpermi2q %zmm3, %zmm0, %zmm8
+; AVX512BW-NEXT:    movb $-64, %bl
+; AVX512BW-NEXT:    kmovd %ebx, %k1
+; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm8 {%k1}
+; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm9
+; AVX512BW-NEXT:    vmovdqa 64(%rdi), %xmm10
+; AVX512BW-NEXT:    vinserti128 $1, 192(%rdi), %ymm10, %ymm10
+; AVX512BW-NEXT:    vinserti128 $1, 128(%rdi), %ymm9, %ymm9
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm11 = ymm9[0],ymm10[0],ymm9[2],ymm10[2]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm11, %zmm8, %zmm16
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm11 = [1,9,1,9,1,9,1,9]
 ; AVX512BW-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm12
+; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm11, %zmm12
+; AVX512BW-NEXT:    vpermi2q %zmm3, %zmm0, %zmm11
+; AVX512BW-NEXT:    vmovdqa64 %zmm12, %zmm11 {%k1}
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm9 = ymm9[1],ymm10[1],ymm9[3],ymm10[3]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm9, %zmm11, %zmm9
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm10 = [2,10,2,10,2,10,2,10]
+; AVX512BW-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm11
+; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm10, %zmm11
+; AVX512BW-NEXT:    vpermi2q %zmm3, %zmm0, %zmm10
+; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm10 {%k1}
+; AVX512BW-NEXT:    vmovdqa 192(%rdi), %ymm11
+; AVX512BW-NEXT:    vmovdqa 128(%rdi), %ymm12
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm11[0],ymm12[2],ymm11[2]
+; AVX512BW-NEXT:    vmovdqa 64(%rdi), %ymm14
+; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm15
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm8 = ymm15[0],ymm14[0],ymm15[2],ymm14[2]
+; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm8 = ymm8[2,3],ymm13[2,3]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm8, %zmm10, %zmm10
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm8 = [3,11,3,11,3,11,3,11]
+; AVX512BW-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm13
+; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm8, %zmm13
+; AVX512BW-NEXT:    vpermi2q %zmm3, %zmm0, %zmm8
+; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm8 {%k1}
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm11 = ymm12[1],ymm11[1],ymm12[3],ymm11[3]
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm12 = ymm15[1],ymm14[1],ymm15[3],ymm14[3]
+; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm11 = ymm12[2,3],ymm11[2,3]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm11, %zmm8, %zmm8
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm11 = [4,12,4,12,4,12,4,12]
+; AVX512BW-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm12
 ; AVX512BW-NEXT:    vpermt2q %zmm7, %zmm11, %zmm12
-; AVX512BW-NEXT:    vpermi2q %zmm4, %zmm2, %zmm11
-; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm7 = [5,13,5,13]
-; AVX512BW-NEXT:    # ymm7 = mem[0,1,0,1]
-; AVX512BW-NEXT:    vpermi2q %zmm18, %zmm1, %zmm7
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm7 = ymm11[0,1,2,3],ymm7[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm5 {%k1}
-; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm6
-; AVX512BW-NEXT:    vmovdqa 64(%rdi), %xmm11
-; AVX512BW-NEXT:    vinserti128 $1, 192(%rdi), %ymm11, %ymm11
-; AVX512BW-NEXT:    vinserti128 $1, 128(%rdi), %ymm6, %ymm6
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm14 = ymm6[0],ymm11[0],ymm6[2],ymm11[2]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm14, %zmm5, %zmm5
-; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm8 {%k1}
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],ymm11[1],ymm6[3],ymm11[3]
-; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rdi
-; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm6, %zmm8, %zmm6
-; AVX512BW-NEXT:    vmovdqa64 %zmm12, %zmm16 {%k1}
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm7, %zmm16, %zmm7
-; AVX512BW-NEXT:    vpermi2q %zmm4, %zmm2, %zmm15
-; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm8 = [6,14,6,14]
-; AVX512BW-NEXT:    # ymm8 = mem[0,1,0,1]
-; AVX512BW-NEXT:    vpermi2q %zmm18, %zmm1, %zmm8
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm8 = ymm15[0,1,2,3],ymm8[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm8, %zmm17, %zmm8
-; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm19, %zmm2
-; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [7,15,7,15]
-; AVX512BW-NEXT:    # ymm4 = mem[0,1,0,1]
-; AVX512BW-NEXT:    vpermi2q %zmm18, %zmm1, %zmm4
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm4[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm3, %zmm1
-; AVX512BW-NEXT:    vmovdqa64 %zmm5, (%rsi)
-; AVX512BW-NEXT:    vmovdqa64 %zmm6, (%rdx)
-; AVX512BW-NEXT:    vmovdqa64 %zmm9, (%rcx)
-; AVX512BW-NEXT:    vmovdqa64 %zmm13, (%r8)
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%r9)
-; AVX512BW-NEXT:    vmovdqa64 %zmm7, (%r10)
-; AVX512BW-NEXT:    vmovdqa64 %zmm8, (%rdi)
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, (%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm13
+; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm11, %zmm13
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7]
+; AVX512BW-NEXT:    vpermi2q %zmm4, %zmm5, %zmm11
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm13 = zmm0[0],zmm3[0],zmm0[2],zmm3[2],zmm0[4],zmm3[4],zmm0[6],zmm3[6]
+; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm13 {%k1}
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm12, %zmm13, %zmm11
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,5,13,5,13,5,13]
+; AVX512BW-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm13
+; AVX512BW-NEXT:    vpermt2q %zmm7, %zmm12, %zmm13
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm14
+; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm12, %zmm14
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7]
+; AVX512BW-NEXT:    vpermi2q %zmm4, %zmm5, %zmm12
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm14 = zmm0[1],zmm3[1],zmm0[3],zmm3[3],zmm0[5],zmm3[5],zmm0[7],zmm3[7]
+; AVX512BW-NEXT:    vmovdqa64 %zmm12, %zmm14 {%k1}
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm13, %zmm14, %zmm12
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,6,14,6,14,6,14]
+; AVX512BW-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm14
+; AVX512BW-NEXT:    vpermt2q %zmm7, %zmm13, %zmm14
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm15
+; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm13, %zmm15
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3],ymm14[4,5,6,7]
+; AVX512BW-NEXT:    vpermi2q %zmm3, %zmm0, %zmm13
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm13 {%k1} = zmm5[0],zmm4[0],zmm5[2],zmm4[2],zmm5[4],zmm4[4],zmm5[6],zmm4[6]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm14, %zmm13, %zmm13
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm14 = [7,15,7,15,7,15,7,15]
+; AVX512BW-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vpermt2q %zmm7, %zmm14, %zmm6
+; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm14, %zmm1
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7]
+; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm14, %zmm0
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm5[1],zmm4[1],zmm5[3],zmm4[3],zmm5[5],zmm4[5],zmm5[7],zmm4[7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vmovdqa64 %zmm16, (%rsi)
+; AVX512BW-NEXT:    vmovdqa64 %zmm9, (%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm10, (%rcx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm8, (%r8)
+; AVX512BW-NEXT:    vmovdqa64 %zmm11, (%r9)
+; AVX512BW-NEXT:    vmovdqa64 %zmm12, (%r11)
+; AVX512BW-NEXT:    vmovdqa64 %zmm13, (%r10)
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rax)
+; AVX512BW-NEXT:    popq %rbx
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
   %wide.vec = load <64 x i64>, ptr %in.vec, align 64
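(The vf16 hunks below show the same rescheduling at a larger scale: the permute-index constants such as [0,8,0,8,0,8,0,8] remain vbroadcasti32x4 loads from the constant pool, while more of the intermediate zmm values move through stack spills. Purely as an illustration of the constant-sharing shape such tests can exercise, here is a minimal sketch in hypothetical IR, not taken from this file, and with no guarantee the backend folds it exactly this way: a narrow constant sits next to a wider constant whose initializer simply repeats it.

  ; @narrow holds the same bits as the low 128 bits of @wide.
  @narrow = internal constant <2 x i64> <i64 0, i64 8>, align 16
  @wide   = internal constant <8 x i64> <i64 0, i64 8, i64 0, i64 8,
                                         i64 0, i64 8, i64 0, i64 8>, align 64

  define void @share(ptr %out0, ptr %out1) {
    ; two independent constant-pool loads before any combining
    %a = load <2 x i64>, ptr @narrow, align 16
    %b = load <8 x i64>, ptr @wide, align 64
    store <2 x i64> %a, ptr %out0, align 16
    store <8 x i64> %b, ptr %out1, align 64
    ret void
  }

With AVX-512 enabled, @wide is a natural candidate for a 128-bit subvector broadcast from the constant pool, at which point %a only needs the low 128 bits of whatever that broadcast reads.)
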
@@ -1926,221 +1922,222 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512F-LABEL: load_i64_stride8_vf16:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    subq $264, %rsp # imm = 0x108
-; AVX512F-NEXT:    vmovdqa64 704(%rdi), %zmm18
-; AVX512F-NEXT:    vmovaps 640(%rdi), %zmm0
+; AVX512F-NEXT:    vmovaps 576(%rdi), %zmm0
 ; AVX512F-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 576(%rdi), %zmm31
-; AVX512F-NEXT:    vmovdqa64 512(%rdi), %zmm3
-; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 192(%rdi), %zmm17
-; AVX512F-NEXT:    vmovdqa64 128(%rdi), %zmm4
-; AVX512F-NEXT:    vmovdqa64 832(%rdi), %zmm11
-; AVX512F-NEXT:    vmovdqa64 768(%rdi), %zmm28
-; AVX512F-NEXT:    vmovdqa64 960(%rdi), %zmm9
-; AVX512F-NEXT:    vmovdqa64 896(%rdi), %zmm10
-; AVX512F-NEXT:    vmovdqa64 320(%rdi), %zmm14
-; AVX512F-NEXT:    vmovdqa64 256(%rdi), %zmm24
-; AVX512F-NEXT:    vmovdqa64 448(%rdi), %zmm12
-; AVX512F-NEXT:    vmovdqa64 384(%rdi), %zmm13
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm19 = [0,8,0,8,0,8,0,8]
-; AVX512F-NEXT:    # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm5
-; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm7
-; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm6
-; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm12, %zmm19, %zmm0
-; AVX512F-NEXT:    vmovdqa64 %zmm24, %zmm15
-; AVX512F-NEXT:    vpermt2q %zmm14, %zmm19, %zmm15
+; AVX512F-NEXT:    vmovaps 512(%rdi), %zmm0
+; AVX512F-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 704(%rdi), %zmm6
+; AVX512F-NEXT:    vmovdqa64 640(%rdi), %zmm4
+; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 64(%rdi), %zmm7
+; AVX512F-NEXT:    vmovaps (%rdi), %zmm0
+; AVX512F-NEXT:    vmovups %zmm0, (%rsp) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 192(%rdi), %zmm31
+; AVX512F-NEXT:    vmovdqa64 128(%rdi), %zmm8
+; AVX512F-NEXT:    vmovdqa64 832(%rdi), %zmm10
+; AVX512F-NEXT:    vmovdqa64 768(%rdi), %zmm14
+; AVX512F-NEXT:    vmovdqa64 960(%rdi), %zmm30
+; AVX512F-NEXT:    vmovdqa64 896(%rdi), %zmm28
+; AVX512F-NEXT:    vmovdqa64 320(%rdi), %zmm16
+; AVX512F-NEXT:    vmovdqa64 256(%rdi), %zmm5
+; AVX512F-NEXT:    vmovdqa64 448(%rdi), %zmm3
+; AVX512F-NEXT:    vmovdqa64 384(%rdi), %zmm15
+; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm29 = [0,8,0,8,0,8,0,8]
+; AVX512F-NEXT:    # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vmovdqa64 %zmm15, %zmm17
+; AVX512F-NEXT:    vmovdqa64 %zmm15, %zmm20
+; AVX512F-NEXT:    vmovdqa64 %zmm15, %zmm9
+; AVX512F-NEXT:    vmovdqa64 %zmm15, %zmm19
+; AVX512F-NEXT:    vpermt2q %zmm3, %zmm29, %zmm19
+; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm18
+; AVX512F-NEXT:    vpermt2q %zmm16, %zmm29, %zmm18
 ; AVX512F-NEXT:    movb $-64, %al
 ; AVX512F-NEXT:    kmovw %eax, %k1
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm15 {%k1}
-; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512F-NEXT:    vmovdqa64 64(%rdi), %xmm16
-; AVX512F-NEXT:    vinserti32x4 $1, 192(%rdi), %ymm16, %ymm21
-; AVX512F-NEXT:    vinserti32x4 $1, 128(%rdi), %ymm0, %ymm16
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm16[0],ymm21[0],ymm16[2],ymm21[2]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm15, %zmm0
+; AVX512F-NEXT:    vmovdqa64 %zmm19, %zmm18 {%k1}
+; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm19 = [2,10,2,10,2,10,2,10]
+; AVX512F-NEXT:    # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vpermt2q %zmm3, %zmm19, %zmm17
+; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm21
+; AVX512F-NEXT:    vpermt2q %zmm16, %zmm19, %zmm21
+; AVX512F-NEXT:    vmovdqa64 %zmm17, %zmm21 {%k1}
+; AVX512F-NEXT:    vmovdqa64 192(%rdi), %ymm22
+; AVX512F-NEXT:    vmovdqa64 128(%rdi), %ymm23
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm11 = ymm23[0],ymm22[0],ymm23[2],ymm22[2]
+; AVX512F-NEXT:    vmovdqa64 64(%rdi), %ymm24
+; AVX512F-NEXT:    vmovdqa64 (%rdi), %ymm25
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm12 = ymm25[0],ymm24[0],ymm25[2],ymm24[2]
+; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm11 = ymm12[2,3],ymm11[2,3]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm11, %zmm21, %zmm0
 ; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm10, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm9, %zmm19, %zmm0
-; AVX512F-NEXT:    vpermi2q %zmm11, %zmm28, %zmm19
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm19 {%k1}
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,2,10,2,10,2,10]
-; AVX512F-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vpermt2q %zmm12, %zmm0, %zmm5
-; AVX512F-NEXT:    vmovdqa64 %zmm24, %zmm15
-; AVX512F-NEXT:    vpermt2q %zmm14, %zmm0, %zmm15
-; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm15 {%k1}
-; AVX512F-NEXT:    vmovdqa 192(%rdi), %ymm5
-; AVX512F-NEXT:    vmovdqa64 128(%rdi), %ymm20
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm20[0],ymm5[0],ymm20[2],ymm5[2]
-; AVX512F-NEXT:    vmovdqa64 64(%rdi), %ymm22
-; AVX512F-NEXT:    vmovdqa64 (%rdi), %ymm23
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm2 = ymm23[0],ymm22[0],ymm23[2],ymm22[2]
-; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm15, %zmm1
-; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm10, %zmm1
-; AVX512F-NEXT:    vpermt2q %zmm9, %zmm0, %zmm1
-; AVX512F-NEXT:    vpermi2q %zmm11, %zmm28, %zmm0
-; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1}
-; AVX512F-NEXT:    vmovdqa 704(%rdi), %ymm1
-; AVX512F-NEXT:    vmovdqa 640(%rdi), %ymm2
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm15 = ymm2[0],ymm1[0],ymm2[2],ymm1[2]
-; AVX512F-NEXT:    vmovdqa64 576(%rdi), %ymm25
+; AVX512F-NEXT:    vmovdqa64 %zmm28, %zmm11
+; AVX512F-NEXT:    vpermt2q %zmm30, %zmm19, %zmm11
+; AVX512F-NEXT:    vpermi2q %zmm10, %zmm14, %zmm19
+; AVX512F-NEXT:    vmovdqa64 %zmm11, %zmm19 {%k1}
+; AVX512F-NEXT:    vmovdqa 704(%rdi), %ymm11
+; AVX512F-NEXT:    vmovdqa 640(%rdi), %ymm12
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm11[0],ymm12[2],ymm11[2]
+; AVX512F-NEXT:    vmovdqa64 576(%rdi), %ymm26
 ; AVX512F-NEXT:    vmovdqa64 512(%rdi), %ymm27
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm8 = ymm27[0],ymm25[0],ymm27[2],ymm25[2]
-; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm8 = ymm8[2,3],ymm15[2,3]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm8, %zmm0, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm27[0],ymm26[0],ymm27[2],ymm26[2]
+; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm13[2,3]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm19, %zmm19
 ; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11]
 ; AVX512F-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vpermt2q %zmm12, %zmm0, %zmm7
-; AVX512F-NEXT:    vmovdqa64 %zmm24, %zmm30
-; AVX512F-NEXT:    vpermt2q %zmm14, %zmm0, %zmm30
-; AVX512F-NEXT:    vmovdqa64 %zmm7, %zmm30 {%k1}
-; AVX512F-NEXT:    vmovdqa64 64(%rdi), %zmm26
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm5 = ymm20[1],ymm5[1],ymm20[3],ymm5[3]
-; AVX512F-NEXT:    vmovdqa64 (%rdi), %zmm15
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm7 = ymm23[1],ymm22[1],ymm23[3],ymm22[3]
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm23 = [1,9,1,9,1,9,1,9]
-; AVX512F-NEXT:    # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm24, %zmm29
-; AVX512F-NEXT:    vpermt2q %zmm14, %zmm23, %zmm29
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm5, %zmm30, %zmm5
-; AVX512F-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm10, %zmm5
-; AVX512F-NEXT:    vpermt2q %zmm9, %zmm0, %zmm5
-; AVX512F-NEXT:    vpermi2q %zmm11, %zmm28, %zmm0
-; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm0 {%k1}
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3]
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm27[1],ymm25[1],ymm27[3],ymm25[3]
+; AVX512F-NEXT:    vpermt2q %zmm3, %zmm0, %zmm20
+; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm13
+; AVX512F-NEXT:    vpermt2q %zmm16, %zmm0, %zmm13
+; AVX512F-NEXT:    vmovdqa64 %zmm20, %zmm13 {%k1}
+; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm21 = [1,9,1,9,1,9,1,9]
+; AVX512F-NEXT:    # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm23[1],ymm22[1],ymm23[3],ymm22[3]
+; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm23
+; AVX512F-NEXT:    vpermt2q %zmm16, %zmm21, %zmm23
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm25[1],ymm24[1],ymm25[3],ymm24[3]
+; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm13, %zmm20
+; AVX512F-NEXT:    vmovdqa64 %zmm28, %zmm1
+; AVX512F-NEXT:    vpermt2q %zmm30, %zmm0, %zmm1
+; AVX512F-NEXT:    vpermi2q %zmm10, %zmm14, %zmm0
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1}
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm12[1],ymm11[1],ymm12[3],ymm11[3]
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm27[1],ymm26[1],ymm27[3],ymm26[3]
 ; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
 ; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm22
 ; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12]
 ; AVX512F-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vpermt2q %zmm12, %zmm0, %zmm6
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm1 = zmm24[0],zmm14[0],zmm24[2],zmm14[2],zmm24[4],zmm14[4],zmm24[6],zmm14[6]
-; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm1 {%k1}
-; AVX512F-NEXT:    vmovdqa64 %zmm15, %zmm2
-; AVX512F-NEXT:    vpermt2q %zmm26, %zmm0, %zmm2
-; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [4,12,4,12]
-; AVX512F-NEXT:    # ymm5 = mem[0,1,0,1]
-; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm6
-; AVX512F-NEXT:    vpermt2q %zmm17, %zmm5, %zmm6
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm1, %zmm25
-; AVX512F-NEXT:    vmovdqa64 %zmm10, %zmm1
-; AVX512F-NEXT:    vpermt2q %zmm9, %zmm0, %zmm1
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm2 = zmm28[0],zmm11[0],zmm28[2],zmm11[2],zmm28[4],zmm11[4],zmm28[6],zmm11[6]
-; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
-; AVX512F-NEXT:    vpermi2q %zmm31, %zmm3, %zmm0
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512F-NEXT:    vpermi2q %zmm18, %zmm8, %zmm5
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm27
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm6 = [5,13,5,13,5,13,5,13]
-; AVX512F-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm15, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm26, %zmm6, %zmm0
-; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm7 = [5,13,5,13]
-; AVX512F-NEXT:    # ymm7 = mem[0,1,0,1]
+; AVX512F-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm1
+; AVX512F-NEXT:    vpermt2q %zmm31, %zmm0, %zmm1
+; AVX512F-NEXT:    vmovdqu64 (%rsp), %zmm13 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm2
+; AVX512F-NEXT:    vmovdqa64 %zmm7, %zmm11
+; AVX512F-NEXT:    vpermt2q %zmm7, %zmm0, %zmm2
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT:    vpermt2q %zmm3, %zmm0, %zmm9
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm2 = zmm5[0],zmm16[0],zmm5[2],zmm16[2],zmm5[4],zmm16[4],zmm5[6],zmm16[6]
+; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm2 {%k1}
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm2, %zmm24
 ; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm1
-; AVX512F-NEXT:    vpermt2q %zmm17, %zmm7, %zmm1
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm5 = [6,14,6,14,6,14,6,14]
-; AVX512F-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm24, %zmm30
-; AVX512F-NEXT:    vpermt2q %zmm14, %zmm5, %zmm30
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm1 = zmm24[1],zmm14[1],zmm24[3],zmm14[3],zmm24[5],zmm14[5],zmm24[7],zmm14[7]
+; AVX512F-NEXT:    vpermt2q %zmm6, %zmm0, %zmm1
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm2
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm7, %zmm0, %zmm2
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT:    vpermi2q %zmm30, %zmm28, %zmm0
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm2 = zmm14[0],zmm10[0],zmm14[2],zmm10[2],zmm14[4],zmm10[4],zmm14[6],zmm10[6]
+; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k1}
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm2, %zmm25
+; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm27 = [5,13,5,13,5,13,5,13]
+; AVX512F-NEXT:    # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm0
+; AVX512F-NEXT:    vpermt2q %zmm31, %zmm27, %zmm0
+; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm1
+; AVX512F-NEXT:    vpermt2q %zmm11, %zmm27, %zmm1
+; AVX512F-NEXT:    vmovdqa64 %zmm11, %zmm8
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm0 = zmm5[1],zmm16[1],zmm5[3],zmm16[3],zmm5[5],zmm16[5],zmm5[7],zmm16[7]
+; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm26 = [6,14,6,14,6,14,6,14]
+; AVX512F-NEXT:    # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm1
+; AVX512F-NEXT:    vpermt2q %zmm16, %zmm26, %zmm1
 ; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [7,15,7,15,7,15,7,15]
 ; AVX512F-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vpermt2q %zmm14, %zmm2, %zmm24
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm30 {%k1} = zmm13[0],zmm12[0],zmm13[2],zmm12[2],zmm13[4],zmm12[4],zmm13[6],zmm12[6]
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm24 {%k1} = zmm13[1],zmm12[1],zmm13[3],zmm12[3],zmm13[5],zmm12[5],zmm13[7],zmm12[7]
-; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm14
-; AVX512F-NEXT:    vpermt2q %zmm12, %zmm23, %zmm14
-; AVX512F-NEXT:    vpermt2q %zmm12, %zmm6, %zmm13
-; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm1 {%k1}
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm20
-; AVX512F-NEXT:    vmovdqa64 %zmm10, %zmm3
-; AVX512F-NEXT:    vpermt2q %zmm9, %zmm23, %zmm3
-; AVX512F-NEXT:    vpermi2q %zmm11, %zmm28, %zmm23
+; AVX512F-NEXT:    vpermt2q %zmm16, %zmm2, %zmm5
+; AVX512F-NEXT:    vmovdqa64 %zmm15, %zmm12
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm1 {%k1} = zmm15[0],zmm3[0],zmm15[2],zmm3[2],zmm15[4],zmm3[4],zmm15[6],zmm3[6]
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm15[1],zmm3[1],zmm15[3],zmm3[3],zmm15[5],zmm3[5],zmm15[7],zmm3[7]
+; AVX512F-NEXT:    vpermt2q %zmm3, %zmm21, %zmm12
+; AVX512F-NEXT:    vpermt2q %zmm3, %zmm27, %zmm15
+; AVX512F-NEXT:    vmovdqa64 %zmm15, %zmm0 {%k1}
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm9, %zmm0, %zmm17
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm11, %zmm0
+; AVX512F-NEXT:    vpermt2q %zmm6, %zmm27, %zmm0
+; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm15
+; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm9
+; AVX512F-NEXT:    vpermt2q %zmm7, %zmm27, %zmm9
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7]
+; AVX512F-NEXT:    vpermi2q %zmm30, %zmm28, %zmm27
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm9 = zmm14[1],zmm10[1],zmm14[3],zmm10[3],zmm14[5],zmm10[5],zmm14[7],zmm10[7]
+; AVX512F-NEXT:    vmovdqa64 %zmm27, %zmm9 {%k1}
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm9, %zmm27
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm0
+; AVX512F-NEXT:    vpermt2q %zmm31, %zmm26, %zmm0
+; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm9
+; AVX512F-NEXT:    vpermt2q %zmm8, %zmm26, %zmm9
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm16
+; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512F-NEXT:    vpermt2q %zmm31, %zmm2, %zmm4
+; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-NEXT:    vmovdqa64 %zmm28, %zmm1
-; AVX512F-NEXT:    vpermt2q %zmm11, %zmm5, %zmm1
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm13 = zmm28[1],zmm11[1],zmm28[3],zmm11[3],zmm28[5],zmm11[5],zmm28[7],zmm11[7]
-; AVX512F-NEXT:    vpermt2q %zmm11, %zmm2, %zmm28
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm1 {%k1} = zmm10[0],zmm9[0],zmm10[2],zmm9[2],zmm10[4],zmm9[4],zmm10[6],zmm9[6]
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm28 {%k1} = zmm10[1],zmm9[1],zmm10[3],zmm9[3],zmm10[5],zmm9[5],zmm10[7],zmm9[7]
-; AVX512F-NEXT:    vpermt2q %zmm9, %zmm6, %zmm10
-; AVX512F-NEXT:    vmovdqa64 %zmm10, %zmm13 {%k1}
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512F-NEXT:    vpermi2q %zmm31, %zmm10, %zmm6
-; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm0
-; AVX512F-NEXT:    vpermi2q %zmm18, %zmm8, %zmm7
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm6, %zmm13, %zmm12
-; AVX512F-NEXT:    vmovdqa64 %zmm15, %zmm7
-; AVX512F-NEXT:    vpermt2q %zmm26, %zmm5, %zmm7
-; AVX512F-NEXT:    vpermt2q %zmm26, %zmm2, %zmm15
-; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm9 = [6,14,6,14]
-; AVX512F-NEXT:    # ymm9 = mem[0,1,0,1]
-; AVX512F-NEXT:    vpermi2q %zmm31, %zmm10, %zmm5
-; AVX512F-NEXT:    vpermt2q %zmm31, %zmm2, %zmm10
-; AVX512F-NEXT:    vmovdqa64 %zmm10, %zmm8
-; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm2
-; AVX512F-NEXT:    vpermt2q %zmm17, %zmm9, %zmm2
-; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm10 = [7,15,7,15]
-; AVX512F-NEXT:    # ymm10 = mem[0,1,0,1]
-; AVX512F-NEXT:    vpermt2q %zmm17, %zmm10, %zmm4
-; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm6
-; AVX512F-NEXT:    vmovdqa 576(%rdi), %xmm11
-; AVX512F-NEXT:    vinserti128 $1, 704(%rdi), %ymm11, %ymm11
-; AVX512F-NEXT:    vpermi2q %zmm18, %zmm0, %zmm9
-; AVX512F-NEXT:    vpermt2q %zmm18, %zmm10, %zmm0
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm4
-; AVX512F-NEXT:    vmovdqa 512(%rdi), %xmm10
-; AVX512F-NEXT:    vinserti128 $1, 640(%rdi), %ymm10, %ymm10
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm13 = ymm10[0],ymm11[0],ymm10[2],ymm11[2]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm13, %zmm19, %zmm13
-; AVX512F-NEXT:    vmovdqa64 %zmm14, %zmm29 {%k1}
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm14 = ymm16[1],ymm21[1],ymm16[3],ymm21[3]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm14, %zmm29, %zmm14
-; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm23 {%k1}
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm10[1],ymm11[1],ymm10[3],ymm11[3]
+; AVX512F-NEXT:    vpermt2q %zmm30, %zmm29, %zmm1
+; AVX512F-NEXT:    vpermi2q %zmm10, %zmm14, %zmm29
+; AVX512F-NEXT:    vpermt2q %zmm8, %zmm2, %zmm13
+; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm4
+; AVX512F-NEXT:    vmovdqa64 %zmm11, %zmm9
+; AVX512F-NEXT:    vpermt2q %zmm6, %zmm26, %zmm9
+; AVX512F-NEXT:    vpermt2q %zmm6, %zmm2, %zmm11
+; AVX512F-NEXT:    vmovdqa64 %zmm11, %zmm8
+; AVX512F-NEXT:    vmovdqa64 %zmm15, %zmm13
+; AVX512F-NEXT:    vmovdqa64 %zmm15, %zmm11
+; AVX512F-NEXT:    vpermt2q %zmm7, %zmm26, %zmm11
+; AVX512F-NEXT:    vpermt2q %zmm7, %zmm2, %zmm13
+; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm15
+; AVX512F-NEXT:    vmovdqa64 %zmm28, %zmm13
+; AVX512F-NEXT:    vpermt2q %zmm30, %zmm21, %zmm13
+; AVX512F-NEXT:    vpermi2q %zmm10, %zmm14, %zmm21
+; AVX512F-NEXT:    vpermi2q %zmm10, %zmm14, %zmm26
+; AVX512F-NEXT:    vpermt2q %zmm10, %zmm2, %zmm14
+; AVX512F-NEXT:    vmovdqa 64(%rdi), %xmm2
+; AVX512F-NEXT:    vinserti128 $1, 192(%rdi), %ymm2, %ymm2
+; AVX512F-NEXT:    vinserti128 $1, 128(%rdi), %ymm0, %ymm0
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm10 = ymm0[0],ymm2[0],ymm0[2],ymm2[2]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm10, %zmm18, %zmm10
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm29 {%k1}
+; AVX512F-NEXT:    vmovdqa 576(%rdi), %xmm1
+; AVX512F-NEXT:    vinserti128 $1, 704(%rdi), %ymm1, %ymm1
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm26 {%k1} = zmm28[0],zmm30[0],zmm28[2],zmm30[2],zmm28[4],zmm30[4],zmm28[6],zmm30[6]
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm14 {%k1} = zmm28[1],zmm30[1],zmm28[3],zmm30[3],zmm28[5],zmm30[5],zmm28[7],zmm30[7]
+; AVX512F-NEXT:    vmovdqa 512(%rdi), %xmm6
+; AVX512F-NEXT:    vinserti128 $1, 640(%rdi), %ymm6, %ymm6
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm1[0],ymm6[2],ymm1[2]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm7, %zmm29, %zmm7
+; AVX512F-NEXT:    vmovdqa64 %zmm12, %zmm23 {%k1}
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3]
 ; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm23, %zmm0
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm30, %zmm2
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm9[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm5, %zmm1, %zmm1
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm5 = ymm15[0,1,2,3],ymm6[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm5, %zmm24, %zmm6
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm4[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm5, %zmm28, %zmm3
-; AVX512F-NEXT:    vmovdqa64 %zmm13, 64(%rsi)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm4, (%rsi)
-; AVX512F-NEXT:    vmovdqa64 %zmm0, 64(%rdx)
-; AVX512F-NEXT:    vmovdqa64 %zmm14, (%rdx)
-; AVX512F-NEXT:    vmovups (%rsp), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 64(%rcx)
+; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm21 {%k1}
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm1[1],ymm6[3],ymm1[3]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm21, %zmm1
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm9[4,5,6,7]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm26, %zmm2
+; AVX512F-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
+; AVX512F-NEXT:    # ymm4 = ymm4[0,1,2,3],mem[4,5,6,7]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm4, %zmm5, %zmm4
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm5 = ymm15[0,1,2,3],ymm8[4,5,6,7]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm5, %zmm14, %zmm3
+; AVX512F-NEXT:    vmovdqa64 %zmm7, 64(%rsi)
+; AVX512F-NEXT:    vmovdqa64 %zmm10, (%rsi)
+; AVX512F-NEXT:    vmovdqa64 %zmm1, 64(%rdx)
+; AVX512F-NEXT:    vmovdqa64 %zmm0, (%rdx)
+; AVX512F-NEXT:    vmovdqa64 %zmm19, 64(%rcx)
 ; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512F-NEXT:    vmovaps %zmm0, (%rcx)
 ; AVX512F-NEXT:    vmovdqa64 %zmm22, 64(%r8)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, (%r8)
-; AVX512F-NEXT:    vmovdqa64 %zmm27, 64(%r9)
-; AVX512F-NEXT:    vmovdqa64 %zmm25, (%r9)
+; AVX512F-NEXT:    vmovdqa64 %zmm20, (%r8)
+; AVX512F-NEXT:    vmovdqa64 %zmm25, 64(%r9)
+; AVX512F-NEXT:    vmovdqa64 %zmm24, (%r9)
 ; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512F-NEXT:    vmovdqa64 %zmm12, 64(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm20, (%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm27, 64(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm17, (%rax)
 ; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512F-NEXT:    vmovdqa64 %zmm1, 64(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm2, (%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm2, 64(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm16, (%rax)
 ; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512F-NEXT:    vmovdqa64 %zmm3, 64(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm6, (%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm4, (%rax)
 ; AVX512F-NEXT:    addq $264, %rsp # imm = 0x108
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
@@ -2148,221 +2145,222 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-LABEL: load_i64_stride8_vf16:
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    subq $264, %rsp # imm = 0x108
-; AVX512BW-NEXT:    vmovdqa64 704(%rdi), %zmm18
-; AVX512BW-NEXT:    vmovaps 640(%rdi), %zmm0
+; AVX512BW-NEXT:    vmovaps 576(%rdi), %zmm0
 ; AVX512BW-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 576(%rdi), %zmm31
-; AVX512BW-NEXT:    vmovdqa64 512(%rdi), %zmm3
-; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 192(%rdi), %zmm17
-; AVX512BW-NEXT:    vmovdqa64 128(%rdi), %zmm4
-; AVX512BW-NEXT:    vmovdqa64 832(%rdi), %zmm11
-; AVX512BW-NEXT:    vmovdqa64 768(%rdi), %zmm28
-; AVX512BW-NEXT:    vmovdqa64 960(%rdi), %zmm9
-; AVX512BW-NEXT:    vmovdqa64 896(%rdi), %zmm10
-; AVX512BW-NEXT:    vmovdqa64 320(%rdi), %zmm14
-; AVX512BW-NEXT:    vmovdqa64 256(%rdi), %zmm24
-; AVX512BW-NEXT:    vmovdqa64 448(%rdi), %zmm12
-; AVX512BW-NEXT:    vmovdqa64 384(%rdi), %zmm13
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm19 = [0,8,0,8,0,8,0,8]
-; AVX512BW-NEXT:    # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm5
-; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm7
-; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm6
-; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm12, %zmm19, %zmm0
-; AVX512BW-NEXT:    vmovdqa64 %zmm24, %zmm15
-; AVX512BW-NEXT:    vpermt2q %zmm14, %zmm19, %zmm15
+; AVX512BW-NEXT:    vmovaps 512(%rdi), %zmm0
+; AVX512BW-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 704(%rdi), %zmm6
+; AVX512BW-NEXT:    vmovdqa64 640(%rdi), %zmm4
+; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %zmm7
+; AVX512BW-NEXT:    vmovaps (%rdi), %zmm0
+; AVX512BW-NEXT:    vmovups %zmm0, (%rsp) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 192(%rdi), %zmm31
+; AVX512BW-NEXT:    vmovdqa64 128(%rdi), %zmm8
+; AVX512BW-NEXT:    vmovdqa64 832(%rdi), %zmm10
+; AVX512BW-NEXT:    vmovdqa64 768(%rdi), %zmm14
+; AVX512BW-NEXT:    vmovdqa64 960(%rdi), %zmm30
+; AVX512BW-NEXT:    vmovdqa64 896(%rdi), %zmm28
+; AVX512BW-NEXT:    vmovdqa64 320(%rdi), %zmm16
+; AVX512BW-NEXT:    vmovdqa64 256(%rdi), %zmm5
+; AVX512BW-NEXT:    vmovdqa64 448(%rdi), %zmm3
+; AVX512BW-NEXT:    vmovdqa64 384(%rdi), %zmm15
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm29 = [0,8,0,8,0,8,0,8]
+; AVX512BW-NEXT:    # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vmovdqa64 %zmm15, %zmm17
+; AVX512BW-NEXT:    vmovdqa64 %zmm15, %zmm20
+; AVX512BW-NEXT:    vmovdqa64 %zmm15, %zmm9
+; AVX512BW-NEXT:    vmovdqa64 %zmm15, %zmm19
+; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm29, %zmm19
+; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm18
+; AVX512BW-NEXT:    vpermt2q %zmm16, %zmm29, %zmm18
 ; AVX512BW-NEXT:    movb $-64, %al
 ; AVX512BW-NEXT:    kmovd %eax, %k1
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm15 {%k1}
-; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %xmm16
-; AVX512BW-NEXT:    vinserti32x4 $1, 192(%rdi), %ymm16, %ymm21
-; AVX512BW-NEXT:    vinserti32x4 $1, 128(%rdi), %ymm0, %ymm16
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm16[0],ymm21[0],ymm16[2],ymm21[2]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm15, %zmm0
+; AVX512BW-NEXT:    vmovdqa64 %zmm19, %zmm18 {%k1}
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm19 = [2,10,2,10,2,10,2,10]
+; AVX512BW-NEXT:    # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm19, %zmm17
+; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm21
+; AVX512BW-NEXT:    vpermt2q %zmm16, %zmm19, %zmm21
+; AVX512BW-NEXT:    vmovdqa64 %zmm17, %zmm21 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 192(%rdi), %ymm22
+; AVX512BW-NEXT:    vmovdqa64 128(%rdi), %ymm23
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm11 = ymm23[0],ymm22[0],ymm23[2],ymm22[2]
+; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %ymm24
+; AVX512BW-NEXT:    vmovdqa64 (%rdi), %ymm25
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm12 = ymm25[0],ymm24[0],ymm25[2],ymm24[2]
+; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm11 = ymm12[2,3],ymm11[2,3]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm11, %zmm21, %zmm0
 ; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm9, %zmm19, %zmm0
-; AVX512BW-NEXT:    vpermi2q %zmm11, %zmm28, %zmm19
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm19 {%k1}
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,2,10,2,10,2,10]
-; AVX512BW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vpermt2q %zmm12, %zmm0, %zmm5
-; AVX512BW-NEXT:    vmovdqa64 %zmm24, %zmm15
-; AVX512BW-NEXT:    vpermt2q %zmm14, %zmm0, %zmm15
-; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm15 {%k1}
-; AVX512BW-NEXT:    vmovdqa 192(%rdi), %ymm5
-; AVX512BW-NEXT:    vmovdqa64 128(%rdi), %ymm20
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm20[0],ymm5[0],ymm20[2],ymm5[2]
-; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %ymm22
-; AVX512BW-NEXT:    vmovdqa64 (%rdi), %ymm23
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm2 = ymm23[0],ymm22[0],ymm23[2],ymm22[2]
-; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm15, %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm1
-; AVX512BW-NEXT:    vpermt2q %zmm9, %zmm0, %zmm1
-; AVX512BW-NEXT:    vpermi2q %zmm11, %zmm28, %zmm0
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1}
-; AVX512BW-NEXT:    vmovdqa 704(%rdi), %ymm1
-; AVX512BW-NEXT:    vmovdqa 640(%rdi), %ymm2
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm15 = ymm2[0],ymm1[0],ymm2[2],ymm1[2]
-; AVX512BW-NEXT:    vmovdqa64 576(%rdi), %ymm25
+; AVX512BW-NEXT:    vmovdqa64 %zmm28, %zmm11
+; AVX512BW-NEXT:    vpermt2q %zmm30, %zmm19, %zmm11
+; AVX512BW-NEXT:    vpermi2q %zmm10, %zmm14, %zmm19
+; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm19 {%k1}
+; AVX512BW-NEXT:    vmovdqa 704(%rdi), %ymm11
+; AVX512BW-NEXT:    vmovdqa 640(%rdi), %ymm12
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm11[0],ymm12[2],ymm11[2]
+; AVX512BW-NEXT:    vmovdqa64 576(%rdi), %ymm26
 ; AVX512BW-NEXT:    vmovdqa64 512(%rdi), %ymm27
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm8 = ymm27[0],ymm25[0],ymm27[2],ymm25[2]
-; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm8 = ymm8[2,3],ymm15[2,3]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm8, %zmm0, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm27[0],ymm26[0],ymm27[2],ymm26[2]
+; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm13[2,3]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm19, %zmm19
 ; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11]
 ; AVX512BW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vpermt2q %zmm12, %zmm0, %zmm7
-; AVX512BW-NEXT:    vmovdqa64 %zmm24, %zmm30
-; AVX512BW-NEXT:    vpermt2q %zmm14, %zmm0, %zmm30
-; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm30 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %zmm26
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm5 = ymm20[1],ymm5[1],ymm20[3],ymm5[3]
-; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm15
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm7 = ymm23[1],ymm22[1],ymm23[3],ymm22[3]
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm23 = [1,9,1,9,1,9,1,9]
-; AVX512BW-NEXT:    # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm24, %zmm29
-; AVX512BW-NEXT:    vpermt2q %zmm14, %zmm23, %zmm29
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm5, %zmm30, %zmm5
-; AVX512BW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm5
-; AVX512BW-NEXT:    vpermt2q %zmm9, %zmm0, %zmm5
-; AVX512BW-NEXT:    vpermi2q %zmm11, %zmm28, %zmm0
-; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm0 {%k1}
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3]
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm27[1],ymm25[1],ymm27[3],ymm25[3]
+; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm0, %zmm20
+; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm13
+; AVX512BW-NEXT:    vpermt2q %zmm16, %zmm0, %zmm13
+; AVX512BW-NEXT:    vmovdqa64 %zmm20, %zmm13 {%k1}
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm21 = [1,9,1,9,1,9,1,9]
+; AVX512BW-NEXT:    # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm23[1],ymm22[1],ymm23[3],ymm22[3]
+; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm23
+; AVX512BW-NEXT:    vpermt2q %zmm16, %zmm21, %zmm23
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm25[1],ymm24[1],ymm25[3],ymm24[3]
+; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm13, %zmm20
+; AVX512BW-NEXT:    vmovdqa64 %zmm28, %zmm1
+; AVX512BW-NEXT:    vpermt2q %zmm30, %zmm0, %zmm1
+; AVX512BW-NEXT:    vpermi2q %zmm10, %zmm14, %zmm0
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1}
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm12[1],ymm11[1],ymm12[3],ymm11[3]
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm27[1],ymm26[1],ymm27[3],ymm26[3]
 ; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
 ; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm22
 ; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12]
 ; AVX512BW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vpermt2q %zmm12, %zmm0, %zmm6
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm1 = zmm24[0],zmm14[0],zmm24[2],zmm14[2],zmm24[4],zmm14[4],zmm24[6],zmm14[6]
-; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm1 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 %zmm15, %zmm2
-; AVX512BW-NEXT:    vpermt2q %zmm26, %zmm0, %zmm2
-; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [4,12,4,12]
-; AVX512BW-NEXT:    # ymm5 = mem[0,1,0,1]
-; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm6
-; AVX512BW-NEXT:    vpermt2q %zmm17, %zmm5, %zmm6
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm1, %zmm25
-; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm1
-; AVX512BW-NEXT:    vpermt2q %zmm9, %zmm0, %zmm1
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm2 = zmm28[0],zmm11[0],zmm28[2],zmm11[2],zmm28[4],zmm11[4],zmm28[6],zmm11[6]
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
-; AVX512BW-NEXT:    vpermi2q %zmm31, %zmm3, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512BW-NEXT:    vpermi2q %zmm18, %zmm8, %zmm5
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm27
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm6 = [5,13,5,13,5,13,5,13]
-; AVX512BW-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm15, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm26, %zmm6, %zmm0
-; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm7 = [5,13,5,13]
-; AVX512BW-NEXT:    # ymm7 = mem[0,1,0,1]
+; AVX512BW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm1
+; AVX512BW-NEXT:    vpermt2q %zmm31, %zmm0, %zmm1
+; AVX512BW-NEXT:    vmovdqu64 (%rsp), %zmm13 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm2
+; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm11
+; AVX512BW-NEXT:    vpermt2q %zmm7, %zmm0, %zmm2
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm0, %zmm9
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm2 = zmm5[0],zmm16[0],zmm5[2],zmm16[2],zmm5[4],zmm16[4],zmm5[6],zmm16[6]
+; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm2 {%k1}
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm2, %zmm24
 ; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm1
-; AVX512BW-NEXT:    vpermt2q %zmm17, %zmm7, %zmm1
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm5 = [6,14,6,14,6,14,6,14]
-; AVX512BW-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm24, %zmm30
-; AVX512BW-NEXT:    vpermt2q %zmm14, %zmm5, %zmm30
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm1 = zmm24[1],zmm14[1],zmm24[3],zmm14[3],zmm24[5],zmm14[5],zmm24[7],zmm14[7]
+; AVX512BW-NEXT:    vpermt2q %zmm6, %zmm0, %zmm1
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm2
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm7, %zmm0, %zmm2
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-NEXT:    vpermi2q %zmm30, %zmm28, %zmm0
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm2 = zmm14[0],zmm10[0],zmm14[2],zmm10[2],zmm14[4],zmm10[4],zmm14[6],zmm10[6]
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k1}
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm2, %zmm25
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm27 = [5,13,5,13,5,13,5,13]
+; AVX512BW-NEXT:    # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm0
+; AVX512BW-NEXT:    vpermt2q %zmm31, %zmm27, %zmm0
+; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm1
+; AVX512BW-NEXT:    vpermt2q %zmm11, %zmm27, %zmm1
+; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm8
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm0 = zmm5[1],zmm16[1],zmm5[3],zmm16[3],zmm5[5],zmm16[5],zmm5[7],zmm16[7]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm26 = [6,14,6,14,6,14,6,14]
+; AVX512BW-NEXT:    # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm1
+; AVX512BW-NEXT:    vpermt2q %zmm16, %zmm26, %zmm1
 ; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [7,15,7,15,7,15,7,15]
 ; AVX512BW-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vpermt2q %zmm14, %zmm2, %zmm24
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm30 {%k1} = zmm13[0],zmm12[0],zmm13[2],zmm12[2],zmm13[4],zmm12[4],zmm13[6],zmm12[6]
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm24 {%k1} = zmm13[1],zmm12[1],zmm13[3],zmm12[3],zmm13[5],zmm12[5],zmm13[7],zmm12[7]
-; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm14
-; AVX512BW-NEXT:    vpermt2q %zmm12, %zmm23, %zmm14
-; AVX512BW-NEXT:    vpermt2q %zmm12, %zmm6, %zmm13
-; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm1 {%k1}
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm20
-; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm3
-; AVX512BW-NEXT:    vpermt2q %zmm9, %zmm23, %zmm3
-; AVX512BW-NEXT:    vpermi2q %zmm11, %zmm28, %zmm23
+; AVX512BW-NEXT:    vpermt2q %zmm16, %zmm2, %zmm5
+; AVX512BW-NEXT:    vmovdqa64 %zmm15, %zmm12
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm1 {%k1} = zmm15[0],zmm3[0],zmm15[2],zmm3[2],zmm15[4],zmm3[4],zmm15[6],zmm3[6]
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm15[1],zmm3[1],zmm15[3],zmm3[3],zmm15[5],zmm3[5],zmm15[7],zmm3[7]
+; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm21, %zmm12
+; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm27, %zmm15
+; AVX512BW-NEXT:    vmovdqa64 %zmm15, %zmm0 {%k1}
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm9, %zmm0, %zmm17
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm0
+; AVX512BW-NEXT:    vpermt2q %zmm6, %zmm27, %zmm0
+; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm15
+; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm9
+; AVX512BW-NEXT:    vpermt2q %zmm7, %zmm27, %zmm9
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7]
+; AVX512BW-NEXT:    vpermi2q %zmm30, %zmm28, %zmm27
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm9 = zmm14[1],zmm10[1],zmm14[3],zmm10[3],zmm14[5],zmm10[5],zmm14[7],zmm10[7]
+; AVX512BW-NEXT:    vmovdqa64 %zmm27, %zmm9 {%k1}
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm9, %zmm27
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm0
+; AVX512BW-NEXT:    vpermt2q %zmm31, %zmm26, %zmm0
+; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm9
+; AVX512BW-NEXT:    vpermt2q %zmm8, %zmm26, %zmm9
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm16
+; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512BW-NEXT:    vpermt2q %zmm31, %zmm2, %zmm4
+; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqa64 %zmm28, %zmm1
-; AVX512BW-NEXT:    vpermt2q %zmm11, %zmm5, %zmm1
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm13 = zmm28[1],zmm11[1],zmm28[3],zmm11[3],zmm28[5],zmm11[5],zmm28[7],zmm11[7]
-; AVX512BW-NEXT:    vpermt2q %zmm11, %zmm2, %zmm28
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm1 {%k1} = zmm10[0],zmm9[0],zmm10[2],zmm9[2],zmm10[4],zmm9[4],zmm10[6],zmm9[6]
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm28 {%k1} = zmm10[1],zmm9[1],zmm10[3],zmm9[3],zmm10[5],zmm9[5],zmm10[7],zmm9[7]
-; AVX512BW-NEXT:    vpermt2q %zmm9, %zmm6, %zmm10
-; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm13 {%k1}
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512BW-NEXT:    vpermi2q %zmm31, %zmm10, %zmm6
-; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm0
-; AVX512BW-NEXT:    vpermi2q %zmm18, %zmm8, %zmm7
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm6, %zmm13, %zmm12
-; AVX512BW-NEXT:    vmovdqa64 %zmm15, %zmm7
-; AVX512BW-NEXT:    vpermt2q %zmm26, %zmm5, %zmm7
-; AVX512BW-NEXT:    vpermt2q %zmm26, %zmm2, %zmm15
-; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm9 = [6,14,6,14]
-; AVX512BW-NEXT:    # ymm9 = mem[0,1,0,1]
-; AVX512BW-NEXT:    vpermi2q %zmm31, %zmm10, %zmm5
-; AVX512BW-NEXT:    vpermt2q %zmm31, %zmm2, %zmm10
-; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm8
-; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm2
-; AVX512BW-NEXT:    vpermt2q %zmm17, %zmm9, %zmm2
-; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm10 = [7,15,7,15]
-; AVX512BW-NEXT:    # ymm10 = mem[0,1,0,1]
-; AVX512BW-NEXT:    vpermt2q %zmm17, %zmm10, %zmm4
-; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm6
-; AVX512BW-NEXT:    vmovdqa 576(%rdi), %xmm11
-; AVX512BW-NEXT:    vinserti128 $1, 704(%rdi), %ymm11, %ymm11
-; AVX512BW-NEXT:    vpermi2q %zmm18, %zmm0, %zmm9
-; AVX512BW-NEXT:    vpermt2q %zmm18, %zmm10, %zmm0
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm4
-; AVX512BW-NEXT:    vmovdqa 512(%rdi), %xmm10
-; AVX512BW-NEXT:    vinserti128 $1, 640(%rdi), %ymm10, %ymm10
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm13 = ymm10[0],ymm11[0],ymm10[2],ymm11[2]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm13, %zmm19, %zmm13
-; AVX512BW-NEXT:    vmovdqa64 %zmm14, %zmm29 {%k1}
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm14 = ymm16[1],ymm21[1],ymm16[3],ymm21[3]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm14, %zmm29, %zmm14
-; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm23 {%k1}
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm10[1],ymm11[1],ymm10[3],ymm11[3]
+; AVX512BW-NEXT:    vpermt2q %zmm30, %zmm29, %zmm1
+; AVX512BW-NEXT:    vpermi2q %zmm10, %zmm14, %zmm29
+; AVX512BW-NEXT:    vpermt2q %zmm8, %zmm2, %zmm13
+; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm4
+; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm9
+; AVX512BW-NEXT:    vpermt2q %zmm6, %zmm26, %zmm9
+; AVX512BW-NEXT:    vpermt2q %zmm6, %zmm2, %zmm11
+; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm8
+; AVX512BW-NEXT:    vmovdqa64 %zmm15, %zmm13
+; AVX512BW-NEXT:    vmovdqa64 %zmm15, %zmm11
+; AVX512BW-NEXT:    vpermt2q %zmm7, %zmm26, %zmm11
+; AVX512BW-NEXT:    vpermt2q %zmm7, %zmm2, %zmm13
+; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm15
+; AVX512BW-NEXT:    vmovdqa64 %zmm28, %zmm13
+; AVX512BW-NEXT:    vpermt2q %zmm30, %zmm21, %zmm13
+; AVX512BW-NEXT:    vpermi2q %zmm10, %zmm14, %zmm21
+; AVX512BW-NEXT:    vpermi2q %zmm10, %zmm14, %zmm26
+; AVX512BW-NEXT:    vpermt2q %zmm10, %zmm2, %zmm14
+; AVX512BW-NEXT:    vmovdqa 64(%rdi), %xmm2
+; AVX512BW-NEXT:    vinserti128 $1, 192(%rdi), %ymm2, %ymm2
+; AVX512BW-NEXT:    vinserti128 $1, 128(%rdi), %ymm0, %ymm0
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm10 = ymm0[0],ymm2[0],ymm0[2],ymm2[2]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm10, %zmm18, %zmm10
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm29 {%k1}
+; AVX512BW-NEXT:    vmovdqa 576(%rdi), %xmm1
+; AVX512BW-NEXT:    vinserti128 $1, 704(%rdi), %ymm1, %ymm1
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm26 {%k1} = zmm28[0],zmm30[0],zmm28[2],zmm30[2],zmm28[4],zmm30[4],zmm28[6],zmm30[6]
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm14 {%k1} = zmm28[1],zmm30[1],zmm28[3],zmm30[3],zmm28[5],zmm30[5],zmm28[7],zmm30[7]
+; AVX512BW-NEXT:    vmovdqa 512(%rdi), %xmm6
+; AVX512BW-NEXT:    vinserti128 $1, 640(%rdi), %ymm6, %ymm6
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm1[0],ymm6[2],ymm1[2]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm7, %zmm29, %zmm7
+; AVX512BW-NEXT:    vmovdqa64 %zmm12, %zmm23 {%k1}
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3]
 ; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm23, %zmm0
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm30, %zmm2
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm9[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm5, %zmm1, %zmm1
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm15[0,1,2,3],ymm6[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm5, %zmm24, %zmm6
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm4[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm5, %zmm28, %zmm3
-; AVX512BW-NEXT:    vmovdqa64 %zmm13, 64(%rsi)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm4, (%rsi)
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, 64(%rdx)
-; AVX512BW-NEXT:    vmovdqa64 %zmm14, (%rdx)
-; AVX512BW-NEXT:    vmovups (%rsp), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 64(%rcx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm21 {%k1}
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm1[1],ymm6[3],ymm1[3]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm21, %zmm1
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm9[4,5,6,7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm26, %zmm2
+; AVX512BW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
+; AVX512BW-NEXT:    # ymm4 = ymm4[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm4, %zmm5, %zmm4
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm15[0,1,2,3],ymm8[4,5,6,7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm5, %zmm14, %zmm3
+; AVX512BW-NEXT:    vmovdqa64 %zmm7, 64(%rsi)
+; AVX512BW-NEXT:    vmovdqa64 %zmm10, (%rsi)
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, 64(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm19, 64(%rcx)
 ; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-NEXT:    vmovaps %zmm0, (%rcx)
 ; AVX512BW-NEXT:    vmovdqa64 %zmm22, 64(%r8)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, (%r8)
-; AVX512BW-NEXT:    vmovdqa64 %zmm27, 64(%r9)
-; AVX512BW-NEXT:    vmovdqa64 %zmm25, (%r9)
+; AVX512BW-NEXT:    vmovdqa64 %zmm20, (%r8)
+; AVX512BW-NEXT:    vmovdqa64 %zmm25, 64(%r9)
+; AVX512BW-NEXT:    vmovdqa64 %zmm24, (%r9)
 ; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-NEXT:    vmovdqa64 %zmm12, 64(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm20, (%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm27, 64(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm17, (%rax)
 ; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, 64(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, (%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, 64(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm16, (%rax)
 ; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512BW-NEXT:    vmovdqa64 %zmm3, 64(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm6, (%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm4, (%rax)
 ; AVX512BW-NEXT:    addq $264, %rsp # imm = 0x108
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
@@ -4240,1031 +4238,1043 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ;
 ; AVX512F-LABEL: load_i64_stride8_vf32:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    subq $2632, %rsp # imm = 0xA48
-; AVX512F-NEXT:    vmovdqa64 1856(%rdi), %zmm14
-; AVX512F-NEXT:    vmovdqa64 1984(%rdi), %zmm28
-; AVX512F-NEXT:    vmovdqa64 1344(%rdi), %zmm21
+; AVX512F-NEXT:    subq $3208, %rsp # imm = 0xC88
+; AVX512F-NEXT:    vmovdqa64 1856(%rdi), %zmm12
+; AVX512F-NEXT:    vmovdqa64 1984(%rdi), %zmm16
+; AVX512F-NEXT:    vmovdqa64 1344(%rdi), %zmm19
 ; AVX512F-NEXT:    vmovdqa64 1280(%rdi), %zmm18
-; AVX512F-NEXT:    vmovdqa64 1472(%rdi), %zmm17
-; AVX512F-NEXT:    vmovdqa64 1408(%rdi), %zmm22
+; AVX512F-NEXT:    vmovdqa64 1472(%rdi), %zmm5
+; AVX512F-NEXT:    vmovdqa64 1408(%rdi), %zmm27
 ; AVX512F-NEXT:    vmovdqa64 832(%rdi), %zmm7
-; AVX512F-NEXT:    vmovdqa64 768(%rdi), %zmm8
-; AVX512F-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 960(%rdi), %zmm10
+; AVX512F-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 768(%rdi), %zmm11
+; AVX512F-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 960(%rdi), %zmm31
 ; AVX512F-NEXT:    vmovdqa64 896(%rdi), %zmm3
-; AVX512F-NEXT:    vmovdqa64 320(%rdi), %zmm15
-; AVX512F-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 256(%rdi), %zmm30
-; AVX512F-NEXT:    vmovdqa64 448(%rdi), %zmm19
-; AVX512F-NEXT:    vmovdqa64 384(%rdi), %zmm20
+; AVX512F-NEXT:    vmovdqa64 320(%rdi), %zmm20
+; AVX512F-NEXT:    vmovdqa64 256(%rdi), %zmm8
+; AVX512F-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 448(%rdi), %zmm25
+; AVX512F-NEXT:    vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 384(%rdi), %zmm23
 ; AVX512F-NEXT:    movb $-64, %al
 ; AVX512F-NEXT:    kmovw %eax, %k1
 ; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,2,10,2,10,2,10]
 ; AVX512F-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm22, %zmm1
-; AVX512F-NEXT:    vpermt2q %zmm17, %zmm0, %zmm1
+; AVX512F-NEXT:    vmovdqa64 %zmm27, %zmm1
+; AVX512F-NEXT:    vpermt2q %zmm5, %zmm0, %zmm1
 ; AVX512F-NEXT:    vmovdqa64 %zmm18, %zmm2
-; AVX512F-NEXT:    vpermt2q %zmm21, %zmm0, %zmm2
+; AVX512F-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm19, %zmm0, %zmm2
 ; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
-; AVX512F-NEXT:    vmovdqa64 1216(%rdi), %ymm29
-; AVX512F-NEXT:    vmovdqa 1152(%rdi), %ymm13
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm5 = ymm13[0],ymm29[0],ymm13[2],ymm29[2]
-; AVX512F-NEXT:    vmovdqa 1088(%rdi), %ymm6
-; AVX512F-NEXT:    vmovdqa 1024(%rdi), %ymm4
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm9 = ymm4[0],ymm6[0],ymm4[2],ymm6[2]
-; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm5 = ymm9[2,3],ymm5[2,3]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm5, %zmm2, %zmm1
+; AVX512F-NEXT:    vmovdqa64 1216(%rdi), %ymm22
+; AVX512F-NEXT:    vmovdqa 1152(%rdi), %ymm10
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm4 = ymm10[0],ymm22[0],ymm10[2],ymm22[2]
+; AVX512F-NEXT:    vmovdqa 1088(%rdi), %ymm9
+; AVX512F-NEXT:    vmovdqa 1024(%rdi), %ymm6
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm13 = ymm6[0],ymm9[0],ymm6[2],ymm9[2]
+; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm4 = ymm13[2,3],ymm4[2,3]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm4, %zmm2, %zmm1
 ; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm2
-; AVX512F-NEXT:    vpermt2q %zmm10, %zmm0, %zmm2
-; AVX512F-NEXT:    vmovdqa64 %zmm10, %zmm24
-; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm5
-; AVX512F-NEXT:    vpermt2q %zmm7, %zmm0, %zmm5
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm5 {%k1}
-; AVX512F-NEXT:    vmovdqa 704(%rdi), %ymm8
-; AVX512F-NEXT:    vmovdqa 640(%rdi), %ymm9
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm10 = ymm9[0],ymm8[0],ymm9[2],ymm8[2]
-; AVX512F-NEXT:    vmovdqa 576(%rdi), %ymm12
-; AVX512F-NEXT:    vmovdqa64 512(%rdi), %ymm16
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm11 = ymm16[0],ymm12[0],ymm16[2],ymm12[2]
-; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm10 = ymm11[2,3],ymm10[2,3]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm10, %zmm5, %zmm1
+; AVX512F-NEXT:    vpermt2q %zmm31, %zmm0, %zmm2
+; AVX512F-NEXT:    vmovdqa64 %zmm11, %zmm4
+; AVX512F-NEXT:    vpermt2q %zmm7, %zmm0, %zmm4
+; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm4 {%k1}
+; AVX512F-NEXT:    vmovdqa 704(%rdi), %ymm7
+; AVX512F-NEXT:    vmovdqa 640(%rdi), %ymm13
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm14 = ymm13[0],ymm7[0],ymm13[2],ymm7[2]
+; AVX512F-NEXT:    vmovdqa64 576(%rdi), %ymm17
+; AVX512F-NEXT:    vmovdqa64 512(%rdi), %ymm21
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm11 = ymm21[0],ymm17[0],ymm21[2],ymm17[2]
+; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm11 = ymm11[2,3],ymm14[2,3]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm11, %zmm4, %zmm1
 ; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm20, %zmm5
-; AVX512F-NEXT:    vpermt2q %zmm19, %zmm0, %zmm5
-; AVX512F-NEXT:    vmovdqa64 %zmm30, %zmm10
-; AVX512F-NEXT:    vpermt2q %zmm15, %zmm0, %zmm10
-; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm10 {%k1}
-; AVX512F-NEXT:    vmovdqa 192(%rdi), %ymm5
-; AVX512F-NEXT:    vmovdqa64 128(%rdi), %ymm25
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm11 = ymm25[0],ymm5[0],ymm25[2],ymm5[2]
-; AVX512F-NEXT:    vmovdqa64 64(%rdi), %ymm26
-; AVX512F-NEXT:    vmovdqa64 (%rdi), %ymm27
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm15 = ymm27[0],ymm26[0],ymm27[2],ymm26[2]
-; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm15 = ymm15[2,3],ymm11[2,3]
-; AVX512F-NEXT:    vmovdqa64 1920(%rdi), %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm15, %zmm10, %zmm1
+; AVX512F-NEXT:    vmovdqa64 %zmm23, %zmm4
+; AVX512F-NEXT:    vpermt2q %zmm25, %zmm0, %zmm4
+; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm11
+; AVX512F-NEXT:    vpermt2q %zmm20, %zmm0, %zmm11
+; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm11 {%k1}
+; AVX512F-NEXT:    vmovdqa 192(%rdi), %ymm4
+; AVX512F-NEXT:    vmovdqa 128(%rdi), %ymm14
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm14[0],ymm4[0],ymm14[2],ymm4[2]
+; AVX512F-NEXT:    vmovdqa64 64(%rdi), %ymm25
+; AVX512F-NEXT:    vmovdqa64 (%rdi), %ymm28
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm8 = ymm28[0],ymm25[0],ymm28[2],ymm25[2]
+; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm8[2,3],ymm1[2,3]
+; AVX512F-NEXT:    vmovdqa64 1920(%rdi), %zmm29
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm11, %zmm1
 ; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm10
-; AVX512F-NEXT:    vmovdqa64 %zmm28, %zmm15
-; AVX512F-NEXT:    vpermt2q %zmm28, %zmm0, %zmm10
-; AVX512F-NEXT:    vmovdqa64 1792(%rdi), %zmm23
-; AVX512F-NEXT:    vmovdqa64 %zmm14, %zmm11
-; AVX512F-NEXT:    vpermi2q %zmm14, %zmm23, %zmm0
-; AVX512F-NEXT:    vmovdqa64 %zmm10, %zmm0 {%k1}
-; AVX512F-NEXT:    vmovdqa 1728(%rdi), %ymm10
-; AVX512F-NEXT:    vmovdqa64 1664(%rdi), %ymm28
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm28[0],ymm10[0],ymm28[2],ymm10[2]
-; AVX512F-NEXT:    vmovdqa64 1600(%rdi), %ymm31
-; AVX512F-NEXT:    vmovdqa 1536(%rdi), %ymm2
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm14 = ymm2[0],ymm31[0],ymm2[2],ymm31[2]
-; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm14[2,3],ymm1[2,3]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm0
+; AVX512F-NEXT:    vmovdqa64 %zmm29, %zmm1
+; AVX512F-NEXT:    vpermt2q %zmm16, %zmm0, %zmm1
+; AVX512F-NEXT:    vmovdqa64 1792(%rdi), %zmm26
+; AVX512F-NEXT:    vpermi2q %zmm12, %zmm26, %zmm0
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1}
+; AVX512F-NEXT:    vmovdqa 1728(%rdi), %ymm2
+; AVX512F-NEXT:    vmovdqa 1664(%rdi), %ymm8
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm11 = ymm8[0],ymm2[0],ymm8[2],ymm2[2]
+; AVX512F-NEXT:    vmovdqa64 1600(%rdi), %ymm30
+; AVX512F-NEXT:    vmovdqa 1536(%rdi), %ymm1
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm15 = ymm1[0],ymm30[0],ymm1[2],ymm30[2]
+; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm11 = ymm15[2,3],ymm11[2,3]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm11, %zmm0, %zmm0
 ; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11]
 ; AVX512F-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm22, %zmm1
-; AVX512F-NEXT:    vpermt2q %zmm17, %zmm0, %zmm1
-; AVX512F-NEXT:    vmovdqa64 %zmm18, %zmm14
-; AVX512F-NEXT:    vpermt2q %zmm21, %zmm0, %zmm14
-; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm14 {%k1}
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm13[1],ymm29[1],ymm13[3],ymm29[3]
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm4 = ymm4[1],ymm6[1],ymm4[3],ymm6[3]
-; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[2,3]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm14, %zmm1
-; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm1
-; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm13
-; AVX512F-NEXT:    vmovdqu64 %zmm3, (%rsp) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm24, %zmm0, %zmm1
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm4
-; AVX512F-NEXT:    vpermt2q %zmm7, %zmm0, %zmm4
-; AVX512F-NEXT:    vmovdqa64 %zmm7, %zmm29
-; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm8[1],ymm9[3],ymm8[3]
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm16[1],ymm12[1],ymm16[3],ymm12[3]
-; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
-; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm20, %zmm1
-; AVX512F-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm19, %zmm0, %zmm1
-; AVX512F-NEXT:    vmovdqa64 %zmm30, %zmm3
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm14, %zmm0, %zmm3
-; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm3 {%k1}
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm25[1],ymm5[1],ymm25[3],ymm5[3]
-; AVX512F-NEXT:    vmovdqa64 1216(%rdi), %zmm5
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm4 = ymm27[1],ymm26[1],ymm27[3],ymm26[3]
-; AVX512F-NEXT:    vmovdqa64 1152(%rdi), %zmm8
-; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[2,3]
-; AVX512F-NEXT:    vmovdqa64 1088(%rdi), %zmm9
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm3, %zmm1
-; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm12, %zmm1
-; AVX512F-NEXT:    vpermt2q %zmm15, %zmm0, %zmm1
-; AVX512F-NEXT:    vmovdqa64 %zmm23, %zmm7
-; AVX512F-NEXT:    vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermi2q %zmm11, %zmm23, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1}
-; AVX512F-NEXT:    vmovdqa64 1024(%rdi), %zmm3
+; AVX512F-NEXT:    vmovdqa64 %zmm27, %zmm11
+; AVX512F-NEXT:    vpermt2q %zmm5, %zmm0, %zmm11
+; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm24
+; AVX512F-NEXT:    vpermt2q %zmm19, %zmm0, %zmm18
+; AVX512F-NEXT:    vmovdqa64 %zmm11, %zmm18 {%k1}
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm5 = ymm10[1],ymm22[1],ymm10[3],ymm22[3]
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],ymm9[1],ymm6[3],ymm9[3]
+; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm5 = ymm6[2,3],ymm5[2,3]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm5, %zmm18, %zmm5
+; AVX512F-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm5
+; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm10
+; AVX512F-NEXT:    vpermt2q %zmm31, %zmm0, %zmm5
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm11, %zmm6
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm22, %zmm0, %zmm6
+; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm6 {%k1}
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm13[1],ymm7[1],ymm13[3],ymm7[3]
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm5 = ymm21[1],ymm17[1],ymm21[3],ymm17[3]
+; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm5[2,3],ymm3[2,3]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm3, %zmm6, %zmm3
 ; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm28[1],ymm10[1],ymm28[3],ymm10[3]
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],ymm31[1],ymm2[3],ymm31[3]
-; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
+; AVX512F-NEXT:    vmovdqu64 %zmm23, (%rsp) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm23, %zmm3
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm18, %zmm0, %zmm3
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm21, %zmm5
+; AVX512F-NEXT:    vpermt2q %zmm20, %zmm0, %zmm5
+; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm5 {%k1}
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm14[1],ymm4[1],ymm14[3],ymm4[3]
+; AVX512F-NEXT:    vmovdqa64 1088(%rdi), %zmm7
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm4 = ymm28[1],ymm25[1],ymm28[3],ymm25[3]
+; AVX512F-NEXT:    vmovdqa64 1024(%rdi), %zmm6
+; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3]
+; AVX512F-NEXT:    vmovdqa64 1216(%rdi), %zmm13
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm3, %zmm5, %zmm3
+; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm29, %zmm15
+; AVX512F-NEXT:    vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm29, %zmm3
+; AVX512F-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm16, %zmm0, %zmm3
+; AVX512F-NEXT:    vmovdqa64 %zmm26, %zmm9
+; AVX512F-NEXT:    vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermi2q %zmm12, %zmm26, %zmm0
+; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm0 {%k1}
+; AVX512F-NEXT:    vmovdqa64 1152(%rdi), %zmm4
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm8[1],ymm2[1],ymm8[3],ymm2[3]
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],ymm30[1],ymm1[3],ymm30[3]
+; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3]
 ; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm0
 ; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12]
 ; AVX512F-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm22, %zmm1
-; AVX512F-NEXT:    vmovdqa64 %zmm22, %zmm25
-; AVX512F-NEXT:    vpermt2q %zmm17, %zmm0, %zmm1
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm2 = zmm18[0],zmm21[0],zmm18[2],zmm21[2],zmm18[4],zmm21[4],zmm18[6],zmm21[6]
-; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
-; AVX512F-NEXT:    vpermt2q %zmm9, %zmm0, %zmm3
-; AVX512F-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [4,12,4,12]
-; AVX512F-NEXT:    # ymm1 = mem[0,1,0,1]
-; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm4
-; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm10
-; AVX512F-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm5, %zmm1, %zmm4
-; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm8
-; AVX512F-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm3, %zmm2, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm24, %zmm0, %zmm13
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm6[0],zmm29[0],zmm6[2],zmm29[2],zmm6[4],zmm29[4],zmm6[6],zmm29[6]
-; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm16
-; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm3 {%k1}
-; AVX512F-NEXT:    vmovdqa64 576(%rdi), %zmm4
-; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 512(%rdi), %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm4, %zmm0, %zmm2
-; AVX512F-NEXT:    vmovdqa64 704(%rdi), %zmm22
-; AVX512F-NEXT:    vmovdqa64 640(%rdi), %zmm4
-; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm22, %zmm1, %zmm4
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm19, %zmm0, %zmm20
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm30[0],zmm14[0],zmm30[2],zmm14[2],zmm30[4],zmm14[4],zmm30[6],zmm14[6]
-; AVX512F-NEXT:    vmovdqa64 %zmm14, %zmm6
-; AVX512F-NEXT:    vmovdqa64 %zmm20, %zmm3 {%k1}
-; AVX512F-NEXT:    vmovdqa64 64(%rdi), %zmm4
-; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 (%rdi), %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm4, %zmm0, %zmm2
-; AVX512F-NEXT:    vmovdqa64 192(%rdi), %zmm5
-; AVX512F-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 128(%rdi), %zmm4
+; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm1
 ; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm5, %zmm1, %zmm4
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm12, %zmm2
-; AVX512F-NEXT:    vmovdqa64 %zmm12, %zmm23
-; AVX512F-NEXT:    vpermt2q %zmm15, %zmm0, %zmm2
-; AVX512F-NEXT:    vmovdqa64 %zmm15, %zmm28
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm7[0],zmm11[0],zmm7[2],zmm11[2],zmm7[4],zmm11[4],zmm7[6],zmm11[6]
+; AVX512F-NEXT:    vpermt2q %zmm13, %zmm0, %zmm1
+; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm8
+; AVX512F-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm2
+; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm5
+; AVX512F-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm7, %zmm0, %zmm2
+; AVX512F-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT:    vmovdqa64 %zmm27, %zmm2
+; AVX512F-NEXT:    vpermt2q %zmm24, %zmm0, %zmm2
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm17[0],zmm19[0],zmm17[2],zmm19[2],zmm17[4],zmm19[4],zmm17[6],zmm19[6]
+; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512F-NEXT:    vmovdqa64 704(%rdi), %zmm6
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm3, %zmm1
+; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 640(%rdi), %zmm1
+; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm6, %zmm0, %zmm1
+; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm25
+; AVX512F-NEXT:    vmovdqa64 576(%rdi), %zmm6
+; AVX512F-NEXT:    vmovdqa64 512(%rdi), %zmm14
+; AVX512F-NEXT:    vmovdqa64 %zmm14, %zmm2
+; AVX512F-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm6, %zmm0, %zmm2
+; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm26
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT:    vmovdqa64 %zmm10, %zmm2
+; AVX512F-NEXT:    vmovdqa64 %zmm10, %zmm29
+; AVX512F-NEXT:    vpermt2q %zmm31, %zmm0, %zmm2
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm11[0],zmm22[0],zmm11[2],zmm22[2],zmm11[4],zmm22[4],zmm11[6],zmm22[6]
+; AVX512F-NEXT:    vmovdqa64 %zmm22, %zmm10
 ; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
-; AVX512F-NEXT:    vmovdqa64 1600(%rdi), %zmm2
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm3, %zmm1
+; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 192(%rdi), %zmm2
 ; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 1536(%rdi), %zmm5
-; AVX512F-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermi2q %zmm2, %zmm5, %zmm0
-; AVX512F-NEXT:    vmovdqa64 1728(%rdi), %zmm2
+; AVX512F-NEXT:    vmovdqa64 128(%rdi), %zmm1
+; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm2, %zmm0, %zmm1
+; AVX512F-NEXT:    vmovdqa64 64(%rdi), %zmm6
+; AVX512F-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 (%rdi), %zmm2
 ; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 1664(%rdi), %zmm5
-; AVX512F-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermi2q %zmm2, %zmm5, %zmm1
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm3, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm31 = [0,8,0,8,0,8,0,8]
-; AVX512F-NEXT:    # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm18, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm21, %zmm31, %zmm0
+; AVX512F-NEXT:    vpermt2q %zmm6, %zmm0, %zmm2
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT:    vpermt2q %zmm18, %zmm0, %zmm23
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm21[0],zmm20[0],zmm21[2],zmm20[2],zmm21[4],zmm20[4],zmm21[6],zmm20[6]
+; AVX512F-NEXT:    vmovdqa64 %zmm21, %zmm22
+; AVX512F-NEXT:    vmovdqa64 %zmm23, %zmm3 {%k1}
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm3, %zmm1
+; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 1728(%rdi), %zmm18
+; AVX512F-NEXT:    vmovdqa64 1664(%rdi), %zmm1
+; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm18, %zmm0, %zmm1
+; AVX512F-NEXT:    vmovdqa64 1600(%rdi), %zmm13
+; AVX512F-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 1536(%rdi), %zmm6
+; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm2
+; AVX512F-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm13, %zmm0, %zmm2
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT:    vpermi2q %zmm16, %zmm15, %zmm0
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm2 = zmm9[0],zmm12[0],zmm9[2],zmm12[2],zmm9[4],zmm12[4],zmm9[6],zmm12[6]
+; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k1}
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm2, %zmm0
 ; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm24 = [1,9,1,9,1,9,1,9]
-; AVX512F-NEXT:    # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm18, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm21, %zmm24, %zmm0
+; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [5,13,5,13,5,13,5,13]
+; AVX512F-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vpermt2q %zmm8, %zmm2, %zmm4
+; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm1
+; AVX512F-NEXT:    vpermt2q %zmm7, %zmm2, %zmm1
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm4[4,5,6,7]
+; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm21 = [0,8,0,8,0,8,0,8]
+; AVX512F-NEXT:    # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vmovdqa64 %zmm17, %zmm9
+; AVX512F-NEXT:    vmovdqa64 %zmm17, %zmm0
+; AVX512F-NEXT:    vpermt2q %zmm19, %zmm21, %zmm0
+; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm16 = [1,9,1,9,1,9,1,9]
+; AVX512F-NEXT:    # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vmovdqa64 %zmm17, %zmm0
+; AVX512F-NEXT:    vpermt2q %zmm19, %zmm16, %zmm0
+; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm23 = [6,14,6,14,6,14,6,14]
+; AVX512F-NEXT:    # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vmovdqa64 %zmm17, %zmm0
+; AVX512F-NEXT:    vpermt2q %zmm19, %zmm23, %zmm0
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm7 = zmm17[1],zmm19[1],zmm17[3],zmm19[3],zmm17[5],zmm19[5],zmm17[7],zmm19[7]
+; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm17 = [7,15,7,15,7,15,7,15]
+; AVX512F-NEXT:    # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vpermt2q %zmm19, %zmm17, %zmm9
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm27[0],zmm24[0],zmm27[2],zmm24[2],zmm27[4],zmm24[4],zmm27[6],zmm24[6]
+; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm9 {%k1} = zmm27[1],zmm24[1],zmm27[3],zmm24[3],zmm27[5],zmm24[5],zmm27[7],zmm24[7]
+; AVX512F-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm27, %zmm0
+; AVX512F-NEXT:    vmovdqa64 %zmm27, %zmm8
+; AVX512F-NEXT:    vpermt2q %zmm24, %zmm21, %zmm27
+; AVX512F-NEXT:    vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm24, %zmm16, %zmm0
+; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm24, %zmm2, %zmm8
+; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm7 {%k1}
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm4, %zmm7, %zmm0
 ; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm4 = [6,14,6,14,6,14,6,14]
-; AVX512F-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm18, %zmm3
-; AVX512F-NEXT:    vpermt2q %zmm21, %zmm4, %zmm3
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm2 = zmm18[1],zmm21[1],zmm18[3],zmm21[3],zmm18[5],zmm21[5],zmm18[7],zmm21[7]
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm11 = [7,15,7,15,7,15,7,15]
-; AVX512F-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vpermt2q %zmm21, %zmm11, %zmm18
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm25[0],zmm17[0],zmm25[2],zmm17[2],zmm25[4],zmm17[4],zmm25[6],zmm17[6]
-; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm18 {%k1} = zmm25[1],zmm17[1],zmm25[3],zmm17[3],zmm25[5],zmm17[5],zmm25[7],zmm17[7]
-; AVX512F-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm25, %zmm3
-; AVX512F-NEXT:    vmovdqa64 %zmm25, %zmm1
-; AVX512F-NEXT:    vpermt2q %zmm17, %zmm31, %zmm25
-; AVX512F-NEXT:    vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm17, %zmm24, %zmm3
-; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm5 = [5,13,5,13,5,13,5,13]
-; AVX512F-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vpermt2q %zmm17, %zmm5, %zmm1
-; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm9, %zmm5, %zmm3
-; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm9 = [5,13,5,13]
-; AVX512F-NEXT:    # ymm9 = mem[0,1,0,1]
-; AVX512F-NEXT:    vpermt2q %zmm8, %zmm9, %zmm10
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm3, %zmm2, %zmm0
+; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm7
+; AVX512F-NEXT:    vmovdqa64 %zmm25, %zmm9
+; AVX512F-NEXT:    vpermt2q %zmm25, %zmm2, %zmm7
+; AVX512F-NEXT:    vmovdqa64 %zmm26, %zmm13
+; AVX512F-NEXT:    vpermt2q %zmm26, %zmm2, %zmm14
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3],ymm7[4,5,6,7]
+; AVX512F-NEXT:    vmovdqa64 %zmm11, %zmm0
+; AVX512F-NEXT:    vmovdqa64 %zmm11, %zmm4
+; AVX512F-NEXT:    vpermt2q %zmm10, %zmm21, %zmm4
+; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm11, %zmm4
+; AVX512F-NEXT:    vpermt2q %zmm10, %zmm16, %zmm4
+; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm11, %zmm30
+; AVX512F-NEXT:    vpermt2q %zmm10, %zmm23, %zmm30
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm11 = zmm11[1],zmm10[1],zmm11[3],zmm10[3],zmm11[5],zmm10[5],zmm11[7],zmm10[7]
+; AVX512F-NEXT:    vpermt2q %zmm10, %zmm17, %zmm0
+; AVX512F-NEXT:    vmovdqa64 %zmm29, %zmm4
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm30 {%k1} = zmm29[0],zmm31[0],zmm29[2],zmm31[2],zmm29[4],zmm31[4],zmm29[6],zmm31[6]
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm29[1],zmm31[1],zmm29[3],zmm31[3],zmm29[5],zmm31[5],zmm29[7],zmm31[7]
 ; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm16, %zmm2
-; AVX512F-NEXT:    vmovdqa64 %zmm16, %zmm18
-; AVX512F-NEXT:    vpermt2q %zmm29, %zmm31, %zmm18
-; AVX512F-NEXT:    vmovdqa64 %zmm16, %zmm20
-; AVX512F-NEXT:    vpermt2q %zmm29, %zmm24, %zmm20
-; AVX512F-NEXT:    vpermt2q %zmm29, %zmm4, %zmm16
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm15 = zmm2[1],zmm29[1],zmm2[3],zmm29[3],zmm2[5],zmm29[5],zmm2[7],zmm29[7]
-; AVX512F-NEXT:    vpermt2q %zmm29, %zmm11, %zmm2
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqu64 (%rsp), %zmm26 # 64-byte Reload
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm16 {%k1} = zmm26[0],zmm0[0],zmm26[2],zmm0[2],zmm26[4],zmm0[4],zmm26[6],zmm0[6]
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm26[1],zmm0[1],zmm26[3],zmm0[3],zmm26[5],zmm0[5],zmm26[7],zmm0[7]
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm26, %zmm2
-; AVX512F-NEXT:    vmovdqa64 %zmm26, %zmm21
-; AVX512F-NEXT:    vpermt2q %zmm0, %zmm31, %zmm26
-; AVX512F-NEXT:    vpermt2q %zmm0, %zmm24, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, (%rsp) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm0, %zmm5, %zmm21
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm14, %zmm3
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm0, %zmm5, %zmm3
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm22, %zmm29
-; AVX512F-NEXT:    vpermt2q %zmm22, %zmm9, %zmm10
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm10[4,5,6,7]
-; AVX512F-NEXT:    vmovdqa64 %ymm1, %ymm27
-; AVX512F-NEXT:    vmovdqa64 %zmm30, %zmm12
-; AVX512F-NEXT:    vpermt2q %zmm6, %zmm31, %zmm12
-; AVX512F-NEXT:    vmovdqa64 %zmm30, %zmm17
-; AVX512F-NEXT:    vpermt2q %zmm6, %zmm24, %zmm17
-; AVX512F-NEXT:    vmovdqa64 %zmm30, %zmm19
-; AVX512F-NEXT:    vpermt2q %zmm6, %zmm4, %zmm19
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm13 = zmm30[1],zmm6[1],zmm30[3],zmm6[3],zmm30[5],zmm6[5],zmm30[7],zmm6[7]
-; AVX512F-NEXT:    vpermt2q %zmm6, %zmm11, %zmm30
+; AVX512F-NEXT:    vmovdqa64 %zmm29, %zmm0
+; AVX512F-NEXT:    vmovdqa64 %zmm29, %zmm14
+; AVX512F-NEXT:    vpermt2q %zmm31, %zmm21, %zmm4
+; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm31, %zmm16, %zmm0
+; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm31, %zmm2, %zmm14
+; AVX512F-NEXT:    vmovdqa64 %zmm14, %zmm11 {%k1}
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm8, %zmm11, %zmm0
+; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm28, %zmm2, %zmm8
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm11
+; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm11 # 64-byte Folded Reload
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm8[4,5,6,7]
+; AVX512F-NEXT:    vmovdqa64 %zmm22, %zmm0
+; AVX512F-NEXT:    vmovdqa64 %zmm22, %zmm25
+; AVX512F-NEXT:    vpermt2q %zmm20, %zmm21, %zmm25
+; AVX512F-NEXT:    vmovdqa64 %zmm22, %zmm5
+; AVX512F-NEXT:    vpermt2q %zmm20, %zmm16, %zmm5
+; AVX512F-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm22, %zmm27
+; AVX512F-NEXT:    vpermt2q %zmm20, %zmm23, %zmm27
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm14 = zmm22[1],zmm20[1],zmm22[3],zmm20[3],zmm22[5],zmm20[5],zmm22[7],zmm20[7]
+; AVX512F-NEXT:    vpermt2q %zmm20, %zmm17, %zmm0
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqu64 (%rsp), %zmm31 # 64-byte Reload
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm27 {%k1} = zmm31[0],zmm1[0],zmm31[2],zmm1[2],zmm31[4],zmm1[4],zmm31[6],zmm1[6]
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm31[1],zmm1[1],zmm31[3],zmm1[3],zmm31[5],zmm1[5],zmm31[7],zmm1[7]
+; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm31, %zmm0
+; AVX512F-NEXT:    vmovdqa64 %zmm31, %zmm24
+; AVX512F-NEXT:    vpermt2q %zmm1, %zmm21, %zmm31
+; AVX512F-NEXT:    vpermt2q %zmm1, %zmm16, %zmm0
+; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm1, %zmm2, %zmm24
+; AVX512F-NEXT:    vmovdqa64 %zmm24, %zmm14 {%k1}
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm11, %zmm14, %zmm0
+; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm15, %zmm11
+; AVX512F-NEXT:    vpermt2q %zmm18, %zmm2, %zmm11
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm19, %zmm2, %zmm6
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm11 = ymm6[0,1,2,3],ymm11[4,5,6,7]
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-NEXT:    vpermi2q %zmm20, %zmm5, %zmm2
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm19 {%k1} = zmm22[0],zmm1[0],zmm22[2],zmm1[2],zmm22[4],zmm1[4],zmm22[6],zmm1[6]
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm30 {%k1} = zmm22[1],zmm1[1],zmm22[3],zmm1[3],zmm22[5],zmm1[5],zmm22[7],zmm1[7]
-; AVX512F-NEXT:    vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm22, %zmm2
-; AVX512F-NEXT:    vmovdqa64 %zmm22, %zmm30
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm31, %zmm22
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm24, %zmm2
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm14 = zmm22[1],zmm12[1],zmm22[3],zmm12[3],zmm22[5],zmm12[5],zmm22[7],zmm12[7]
+; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm14 {%k1}
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm11, %zmm14, %zmm2
+; AVX512F-NEXT:    vmovdqu64 %zmm2, (%rsp) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm2
+; AVX512F-NEXT:    vpermt2q %zmm20, %zmm21, %zmm2
 ; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm5, %zmm30
-; AVX512F-NEXT:    vmovdqa64 %zmm23, %zmm7
-; AVX512F-NEXT:    vmovdqa64 %zmm23, %zmm25
-; AVX512F-NEXT:    vmovdqa64 %zmm28, %zmm1
-; AVX512F-NEXT:    vpermt2q %zmm28, %zmm31, %zmm25
+; AVX512F-NEXT:    vpermi2q %zmm12, %zmm22, %zmm21
+; AVX512F-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm2
+; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm21
+; AVX512F-NEXT:    vpermt2q %zmm20, %zmm16, %zmm2
+; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermi2q %zmm12, %zmm22, %zmm16
+; AVX512F-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm12, %zmm16
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm14
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm24, %zmm23, %zmm14
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm26, %zmm23, %zmm11
+; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm12
+; AVX512F-NEXT:    vpermt2q %zmm9, %zmm23, %zmm12
+; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm29
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT:    vpermi2q %zmm2, %zmm3, %zmm31
-; AVX512F-NEXT:    vmovdqa64 %zmm23, %zmm28
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm24, %zmm28
-; AVX512F-NEXT:    vpermi2q %zmm2, %zmm3, %zmm24
-; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm23
-; AVX512F-NEXT:    vpermt2q %zmm2, %zmm4, %zmm23
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm8 = zmm3[1],zmm2[1],zmm3[3],zmm2[3],zmm3[5],zmm2[5],zmm3[7],zmm2[7]
-; AVX512F-NEXT:    vpermt2q %zmm2, %zmm11, %zmm3
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm23 {%k1} = zmm7[0],zmm1[0],zmm7[2],zmm1[2],zmm7[4],zmm1[4],zmm7[6],zmm1[6]
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm7[1],zmm1[1],zmm7[3],zmm1[3],zmm7[5],zmm1[5],zmm7[7],zmm1[7]
-; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm7, %zmm3
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm5, %zmm3
-; AVX512F-NEXT:    vmovdqa64 %zmm14, %zmm10
+; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm8
+; AVX512F-NEXT:    vpermt2q %zmm13, %zmm23, %zmm8
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm10
+; AVX512F-NEXT:    vpermt2q %zmm28, %zmm23, %zmm10
+; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm5
 ; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm7
-; AVX512F-NEXT:    vpermt2q %zmm0, %zmm4, %zmm10
-; AVX512F-NEXT:    vpermt2q %zmm0, %zmm11, %zmm14
-; AVX512F-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm2
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm0, %zmm5, %zmm2
-; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm4
-; AVX512F-NEXT:    vpermt2q %zmm0, %zmm7, %zmm4
-; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm7, %zmm6
-; AVX512F-NEXT:    vpermt2q %zmm0, %zmm11, %zmm1
-; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512F-NEXT:    vpermi2q %zmm14, %zmm0, %zmm5
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm7
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm6, %zmm7
-; AVX512F-NEXT:    vpermi2q %zmm14, %zmm0, %zmm6
-; AVX512F-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm14, %zmm11, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm11, %zmm4
-; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm21, %zmm15 {%k1}
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm27, %zmm15, %zmm0
+; AVX512F-NEXT:    vpermt2q %zmm4, %zmm23, %zmm7
+; AVX512F-NEXT:    vmovdqa64 %zmm15, %zmm0
+; AVX512F-NEXT:    vpermt2q %zmm18, %zmm23, %zmm0
 ; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm30, %zmm13 {%k1}
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm6
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm14, %zmm9, %zmm6
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm6[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm13, %zmm0
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm0
+; AVX512F-NEXT:    vpermt2q %zmm19, %zmm23, %zmm0
 ; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm8 {%k1}
+; AVX512F-NEXT:    vpermi2q %zmm16, %zmm22, %zmm23
+; AVX512F-NEXT:    vpermt2q %zmm16, %zmm17, %zmm22
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm23 {%k1} = zmm21[0],zmm20[0],zmm21[2],zmm20[2],zmm21[4],zmm20[4],zmm21[6],zmm20[6]
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm22 {%k1} = zmm21[1],zmm20[1],zmm21[3],zmm20[3],zmm21[5],zmm20[5],zmm21[7],zmm20[7]
+; AVX512F-NEXT:    vpermt2q %zmm29, %zmm17, %zmm3
+; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm13, %zmm17, %zmm2
+; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm28, %zmm17, %zmm9
+; AVX512F-NEXT:    vpermt2q %zmm4, %zmm17, %zmm5
+; AVX512F-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm18, %zmm17, %zmm15
+; AVX512F-NEXT:    vpermt2q %zmm19, %zmm17, %zmm1
+; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm24, %zmm17, %zmm6
+; AVX512F-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm26, %zmm17, %zmm3
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm4 = ymm11[0,1,2,3],ymm14[4,5,6,7]
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm4, %zmm0, %zmm14
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm12[4,5,6,7]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm30, %zmm4
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm10[4,5,6,7]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm27, %zmm10
+; AVX512F-NEXT:    vmovdqa64 %zmm31, %zmm25 {%k1}
+; AVX512F-NEXT:    vmovdqa 64(%rdi), %xmm0
+; AVX512F-NEXT:    vinserti128 $1, 192(%rdi), %ymm0, %ymm0
+; AVX512F-NEXT:    vmovdqa (%rdi), %xmm1
+; AVX512F-NEXT:    vinserti128 $1, 128(%rdi), %ymm1, %ymm1
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm25, %zmm2
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm7 {%k1}
+; AVX512F-NEXT:    vmovdqa 576(%rdi), %xmm5
+; AVX512F-NEXT:    vinserti128 $1, 704(%rdi), %ymm5, %ymm5
+; AVX512F-NEXT:    vmovdqa 512(%rdi), %xmm6
+; AVX512F-NEXT:    vinserti128 $1, 640(%rdi), %ymm6, %ymm6
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm18 = ymm6[0],ymm5[0],ymm6[2],ymm5[2]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm18, %zmm7, %zmm27
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm7 {%k1}
+; AVX512F-NEXT:    vmovdqa 1088(%rdi), %xmm13
+; AVX512F-NEXT:    vinserti128 $1, 1216(%rdi), %ymm13, %ymm13
+; AVX512F-NEXT:    vmovdqa64 1024(%rdi), %xmm18
+; AVX512F-NEXT:    vinserti32x4 $1, 1152(%rdi), %ymm18, %ymm18
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm19 = ymm18[0],ymm13[0],ymm18[2],ymm13[2]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm19, %zmm7, %zmm19
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm7 {%k1}
+; AVX512F-NEXT:    vmovdqa64 1600(%rdi), %xmm21
+; AVX512F-NEXT:    vinserti32x4 $1, 1728(%rdi), %ymm21, %ymm21
+; AVX512F-NEXT:    vmovdqa64 1536(%rdi), %xmm25
+; AVX512F-NEXT:    vinserti32x4 $1, 1664(%rdi), %ymm25, %ymm25
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm26 = ymm25[0],ymm21[0],ymm25[2],ymm21[2]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm26, %zmm7, %zmm12
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm7 {%k1}
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm13 = ymm18[1],ymm13[1],ymm18[3],ymm13[3]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm13, %zmm7, %zmm13
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm7 {%k1}
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm5 = ymm6[1],ymm5[1],ymm6[3],ymm5[3]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm5, %zmm7, %zmm5
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm7 {%k1}
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm7, %zmm0
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
-; AVX512F-NEXT:    vpermi2q %zmm15, %zmm6, %zmm9
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm9[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm8, %zmm30
-; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [6,14,6,14]
-; AVX512F-NEXT:    # ymm2 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm5
-; AVX512F-NEXT:    vpermt2q %zmm29, %zmm2, %zmm5
-; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [7,15,7,15]
-; AVX512F-NEXT:    # ymm0 = mem[0,1,0,1]
-; AVX512F-NEXT:    vpermt2q %zmm29, %zmm0, %zmm1
-; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm9
-; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm1
-; AVX512F-NEXT:    vpermt2q %zmm14, %zmm2, %zmm1
-; AVX512F-NEXT:    vpermt2q %zmm14, %zmm0, %zmm9
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm11, %zmm8
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm4, %zmm2, %zmm8
-; AVX512F-NEXT:    vpermi2q %zmm15, %zmm6, %zmm2
-; AVX512F-NEXT:    vpermt2q %zmm15, %zmm0, %zmm6
-; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm3
-; AVX512F-NEXT:    vpermt2q %zmm4, %zmm0, %zmm11
-; AVX512F-NEXT:    vmovdqa64 %zmm11, %zmm6
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm8[4,5,6,7]
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm21
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm4 = ymm10[0,1,2,3],ymm5[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm4, %zmm16, %zmm0
-; AVX512F-NEXT:    vmovdqa64 %zmm22, %zmm12 {%k1}
-; AVX512F-NEXT:    vmovdqa 64(%rdi), %xmm4
-; AVX512F-NEXT:    vinserti128 $1, 192(%rdi), %ymm4, %ymm4
-; AVX512F-NEXT:    vmovdqa (%rdi), %xmm5
-; AVX512F-NEXT:    vinserti128 $1, 128(%rdi), %ymm5, %ymm5
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm7 = ymm5[0],ymm4[0],ymm5[2],ymm4[2]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm7, %zmm12, %zmm7
-; AVX512F-NEXT:    vmovdqa64 %zmm26, %zmm18 {%k1}
-; AVX512F-NEXT:    vmovdqa 576(%rdi), %xmm8
-; AVX512F-NEXT:    vinserti128 $1, 704(%rdi), %ymm8, %ymm8
-; AVX512F-NEXT:    vmovdqa 512(%rdi), %xmm10
-; AVX512F-NEXT:    vinserti128 $1, 640(%rdi), %ymm10, %ymm10
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm11 = ymm10[0],ymm8[0],ymm10[2],ymm8[2]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm11, %zmm18, %zmm11
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm12, %zmm13 {%k1}
-; AVX512F-NEXT:    vmovdqa 1088(%rdi), %xmm12
-; AVX512F-NEXT:    vinserti128 $1, 1216(%rdi), %ymm12, %ymm12
-; AVX512F-NEXT:    vmovdqa64 1024(%rdi), %xmm16
-; AVX512F-NEXT:    vinserti32x4 $1, 1152(%rdi), %ymm16, %ymm16
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm22 = ymm16[0],ymm12[0],ymm16[2],ymm12[2]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm22, %zmm13, %zmm22
-; AVX512F-NEXT:    vmovdqa64 %zmm25, %zmm31 {%k1}
-; AVX512F-NEXT:    vmovdqa64 1600(%rdi), %xmm25
-; AVX512F-NEXT:    vinserti32x4 $1, 1728(%rdi), %ymm25, %ymm25
-; AVX512F-NEXT:    vmovdqa64 1536(%rdi), %xmm26
-; AVX512F-NEXT:    vinserti32x4 $1, 1664(%rdi), %ymm26, %ymm26
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm27 = ymm26[0],ymm25[0],ymm26[2],ymm25[2]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm27, %zmm31, %zmm27
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm14, %zmm13 {%k1}
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm12 = ymm16[1],ymm12[1],ymm16[3],ymm12[3]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm12, %zmm13, %zmm12
-; AVX512F-NEXT:    vmovdqu64 (%rsp), %zmm13 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm20 {%k1}
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm8 = ymm10[1],ymm8[1],ymm10[3],ymm8[3]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm8, %zmm20, %zmm8
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm10, %zmm17 {%k1}
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm4 = ymm5[1],ymm4[1],ymm5[3],ymm4[3]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm4, %zmm17, %zmm4
-; AVX512F-NEXT:    vmovdqa64 %zmm28, %zmm24 {%k1}
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm5 = ymm26[1],ymm25[1],ymm26[3],ymm25[3]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm5, %zmm24, %zmm5
-; AVX512F-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX512F-NEXT:    # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm19, %zmm1
-; AVX512F-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
-; AVX512F-NEXT:    # ymm2 = mem[0,1,2,3],ymm2[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm23, %zmm2
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512F-NEXT:    vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload
-; AVX512F-NEXT:    # ymm10 = mem[0,1,2,3],ymm10[4,5,6,7]
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512F-NEXT:    vinsertf64x4 $0, %ymm10, %zmm13, %zmm10
-; AVX512F-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm13 # 32-byte Folded Reload
-; AVX512F-NEXT:    # ymm13 = mem[0,1,2,3],ymm9[4,5,6,7]
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm6 {%k1}
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm25[1],ymm21[1],ymm25[3],ymm21[3]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm6, %zmm1
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512F-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
+; AVX512F-NEXT:    # ymm6 = ymm6[0,1,2,3],mem[4,5,6,7]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm6, %zmm23, %zmm6
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512F-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
+; AVX512F-NEXT:    # ymm7 = ymm7[0,1,2,3],mem[4,5,6,7]
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512F-NEXT:    vinsertf64x4 $0, %ymm7, %zmm8, %zmm7
+; AVX512F-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm8 # 32-byte Folded Reload
+; AVX512F-NEXT:    # ymm8 = mem[0,1,2,3],ymm9[4,5,6,7]
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm13, %zmm9, %zmm13
-; AVX512F-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm14 # 32-byte Folded Reload
-; AVX512F-NEXT:    # ymm14 = mem[0,1,2,3],ymm3[4,5,6,7]
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm14, %zmm3, %zmm14
-; AVX512F-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm15 # 32-byte Folded Reload
-; AVX512F-NEXT:    # ymm15 = mem[0,1,2,3],ymm6[4,5,6,7]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm8, %zmm9, %zmm8
+; AVX512F-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm9 # 32-byte Folded Reload
+; AVX512F-NEXT:    # ymm9 = mem[0,1,2,3],ymm15[4,5,6,7]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm9, %zmm22, %zmm9
+; AVX512F-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm11 # 32-byte Folded Reload
+; AVX512F-NEXT:    # ymm11 = ymm3[0,1,2,3],mem[4,5,6,7]
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm15, %zmm3, %zmm15
-; AVX512F-NEXT:    vmovdqa64 %zmm27, 192(%rsi)
-; AVX512F-NEXT:    vmovdqa64 %zmm22, 128(%rsi)
-; AVX512F-NEXT:    vmovdqa64 %zmm11, 64(%rsi)
-; AVX512F-NEXT:    vmovdqa64 %zmm7, (%rsi)
-; AVX512F-NEXT:    vmovdqa64 %zmm5, 192(%rdx)
-; AVX512F-NEXT:    vmovdqa64 %zmm4, (%rdx)
-; AVX512F-NEXT:    vmovdqa64 %zmm8, 64(%rdx)
-; AVX512F-NEXT:    vmovdqa64 %zmm12, 128(%rdx)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm4, 192(%rcx)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm4, (%rcx)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm4, 64(%rcx)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm4, 128(%rcx)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm4, 192(%r8)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm4, (%r8)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm4, 64(%r8)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm4, 128(%r8)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm3, 192(%r9)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm4, (%r9)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm4, 64(%r9)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm4, 128(%r9)
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm11, %zmm3, %zmm11
+; AVX512F-NEXT:    vmovdqa64 %zmm12, 192(%rsi)
+; AVX512F-NEXT:    vmovdqa64 %zmm19, 128(%rsi)
+; AVX512F-NEXT:    vmovdqa64 %zmm27, 64(%rsi)
+; AVX512F-NEXT:    vmovdqa64 %zmm2, (%rsi)
+; AVX512F-NEXT:    vmovdqa64 %zmm1, 192(%rdx)
+; AVX512F-NEXT:    vmovdqa64 %zmm0, (%rdx)
+; AVX512F-NEXT:    vmovdqa64 %zmm5, 64(%rdx)
+; AVX512F-NEXT:    vmovdqa64 %zmm13, 128(%rdx)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm0, 192(%rcx)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm0, (%rcx)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm0, 64(%rcx)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm0, 128(%rcx)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm0, 192(%r8)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm0, (%r8)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm0, 64(%r8)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm0, 128(%r8)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm0, 192(%r9)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm0, (%r9)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm0, 64(%r9)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm0, 128(%r9)
 ; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512F-NEXT:    vmovdqa64 %zmm30, 192(%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm3, (%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm3, 64(%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm3, 128(%rax)
+; AVX512F-NEXT:    vmovups (%rsp), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm0, 192(%rax)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm0, (%rax)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm0, 64(%rax)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm0, 128(%rax)
 ; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512F-NEXT:    vmovdqa64 %zmm2, 192(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm1, (%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm0, 64(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm21, 128(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm6, 192(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm10, (%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm4, 64(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm14, 128(%rax)
 ; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512F-NEXT:    vmovdqa64 %zmm15, 128(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm14, 192(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm13, (%rax)
-; AVX512F-NEXT:    vmovaps %zmm10, 64(%rax)
-; AVX512F-NEXT:    addq $2632, %rsp # imm = 0xA48
+; AVX512F-NEXT:    vmovdqa64 %zmm11, 128(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm9, 192(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm8, (%rax)
+; AVX512F-NEXT:    vmovaps %zmm7, 64(%rax)
+; AVX512F-NEXT:    addq $3208, %rsp # imm = 0xC88
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512BW-LABEL: load_i64_stride8_vf32:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    subq $2632, %rsp # imm = 0xA48
-; AVX512BW-NEXT:    vmovdqa64 1856(%rdi), %zmm14
-; AVX512BW-NEXT:    vmovdqa64 1984(%rdi), %zmm28
-; AVX512BW-NEXT:    vmovdqa64 1344(%rdi), %zmm21
+; AVX512BW-NEXT:    subq $3208, %rsp # imm = 0xC88
+; AVX512BW-NEXT:    vmovdqa64 1856(%rdi), %zmm12
+; AVX512BW-NEXT:    vmovdqa64 1984(%rdi), %zmm16
+; AVX512BW-NEXT:    vmovdqa64 1344(%rdi), %zmm19
 ; AVX512BW-NEXT:    vmovdqa64 1280(%rdi), %zmm18
-; AVX512BW-NEXT:    vmovdqa64 1472(%rdi), %zmm17
-; AVX512BW-NEXT:    vmovdqa64 1408(%rdi), %zmm22
+; AVX512BW-NEXT:    vmovdqa64 1472(%rdi), %zmm5
+; AVX512BW-NEXT:    vmovdqa64 1408(%rdi), %zmm27
 ; AVX512BW-NEXT:    vmovdqa64 832(%rdi), %zmm7
-; AVX512BW-NEXT:    vmovdqa64 768(%rdi), %zmm8
-; AVX512BW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 960(%rdi), %zmm10
+; AVX512BW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 768(%rdi), %zmm11
+; AVX512BW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 960(%rdi), %zmm31
 ; AVX512BW-NEXT:    vmovdqa64 896(%rdi), %zmm3
-; AVX512BW-NEXT:    vmovdqa64 320(%rdi), %zmm15
-; AVX512BW-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 256(%rdi), %zmm30
-; AVX512BW-NEXT:    vmovdqa64 448(%rdi), %zmm19
-; AVX512BW-NEXT:    vmovdqa64 384(%rdi), %zmm20
+; AVX512BW-NEXT:    vmovdqa64 320(%rdi), %zmm20
+; AVX512BW-NEXT:    vmovdqa64 256(%rdi), %zmm8
+; AVX512BW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 448(%rdi), %zmm25
+; AVX512BW-NEXT:    vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 384(%rdi), %zmm23
 ; AVX512BW-NEXT:    movb $-64, %al
 ; AVX512BW-NEXT:    kmovd %eax, %k1
 ; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,2,10,2,10,2,10]
 ; AVX512BW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm22, %zmm1
-; AVX512BW-NEXT:    vpermt2q %zmm17, %zmm0, %zmm1
+; AVX512BW-NEXT:    vmovdqa64 %zmm27, %zmm1
+; AVX512BW-NEXT:    vpermt2q %zmm5, %zmm0, %zmm1
 ; AVX512BW-NEXT:    vmovdqa64 %zmm18, %zmm2
-; AVX512BW-NEXT:    vpermt2q %zmm21, %zmm0, %zmm2
+; AVX512BW-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm19, %zmm0, %zmm2
 ; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 1216(%rdi), %ymm29
-; AVX512BW-NEXT:    vmovdqa 1152(%rdi), %ymm13
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm5 = ymm13[0],ymm29[0],ymm13[2],ymm29[2]
-; AVX512BW-NEXT:    vmovdqa 1088(%rdi), %ymm6
-; AVX512BW-NEXT:    vmovdqa 1024(%rdi), %ymm4
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm9 = ymm4[0],ymm6[0],ymm4[2],ymm6[2]
-; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm5 = ymm9[2,3],ymm5[2,3]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm5, %zmm2, %zmm1
+; AVX512BW-NEXT:    vmovdqa64 1216(%rdi), %ymm22
+; AVX512BW-NEXT:    vmovdqa 1152(%rdi), %ymm10
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm4 = ymm10[0],ymm22[0],ymm10[2],ymm22[2]
+; AVX512BW-NEXT:    vmovdqa 1088(%rdi), %ymm9
+; AVX512BW-NEXT:    vmovdqa 1024(%rdi), %ymm6
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm13 = ymm6[0],ymm9[0],ymm6[2],ymm9[2]
+; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm4 = ymm13[2,3],ymm4[2,3]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm4, %zmm2, %zmm1
 ; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm2
-; AVX512BW-NEXT:    vpermt2q %zmm10, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm24
-; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm5
-; AVX512BW-NEXT:    vpermt2q %zmm7, %zmm0, %zmm5
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm5 {%k1}
-; AVX512BW-NEXT:    vmovdqa 704(%rdi), %ymm8
-; AVX512BW-NEXT:    vmovdqa 640(%rdi), %ymm9
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm10 = ymm9[0],ymm8[0],ymm9[2],ymm8[2]
-; AVX512BW-NEXT:    vmovdqa 576(%rdi), %ymm12
-; AVX512BW-NEXT:    vmovdqa64 512(%rdi), %ymm16
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm11 = ymm16[0],ymm12[0],ymm16[2],ymm12[2]
-; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm10 = ymm11[2,3],ymm10[2,3]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm10, %zmm5, %zmm1
+; AVX512BW-NEXT:    vpermt2q %zmm31, %zmm0, %zmm2
+; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm4
+; AVX512BW-NEXT:    vpermt2q %zmm7, %zmm0, %zmm4
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm4 {%k1}
+; AVX512BW-NEXT:    vmovdqa 704(%rdi), %ymm7
+; AVX512BW-NEXT:    vmovdqa 640(%rdi), %ymm13
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm14 = ymm13[0],ymm7[0],ymm13[2],ymm7[2]
+; AVX512BW-NEXT:    vmovdqa64 576(%rdi), %ymm17
+; AVX512BW-NEXT:    vmovdqa64 512(%rdi), %ymm21
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm11 = ymm21[0],ymm17[0],ymm21[2],ymm17[2]
+; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm11 = ymm11[2,3],ymm14[2,3]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm11, %zmm4, %zmm1
 ; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm20, %zmm5
-; AVX512BW-NEXT:    vpermt2q %zmm19, %zmm0, %zmm5
-; AVX512BW-NEXT:    vmovdqa64 %zmm30, %zmm10
-; AVX512BW-NEXT:    vpermt2q %zmm15, %zmm0, %zmm10
-; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm10 {%k1}
-; AVX512BW-NEXT:    vmovdqa 192(%rdi), %ymm5
-; AVX512BW-NEXT:    vmovdqa64 128(%rdi), %ymm25
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm11 = ymm25[0],ymm5[0],ymm25[2],ymm5[2]
-; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %ymm26
-; AVX512BW-NEXT:    vmovdqa64 (%rdi), %ymm27
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm15 = ymm27[0],ymm26[0],ymm27[2],ymm26[2]
-; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm15 = ymm15[2,3],ymm11[2,3]
-; AVX512BW-NEXT:    vmovdqa64 1920(%rdi), %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm15, %zmm10, %zmm1
+; AVX512BW-NEXT:    vmovdqa64 %zmm23, %zmm4
+; AVX512BW-NEXT:    vpermt2q %zmm25, %zmm0, %zmm4
+; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm11
+; AVX512BW-NEXT:    vpermt2q %zmm20, %zmm0, %zmm11
+; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm11 {%k1}
+; AVX512BW-NEXT:    vmovdqa 192(%rdi), %ymm4
+; AVX512BW-NEXT:    vmovdqa 128(%rdi), %ymm14
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm14[0],ymm4[0],ymm14[2],ymm4[2]
+; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %ymm25
+; AVX512BW-NEXT:    vmovdqa64 (%rdi), %ymm28
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm8 = ymm28[0],ymm25[0],ymm28[2],ymm25[2]
+; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm8[2,3],ymm1[2,3]
+; AVX512BW-NEXT:    vmovdqa64 1920(%rdi), %zmm29
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm11, %zmm1
 ; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm10
-; AVX512BW-NEXT:    vmovdqa64 %zmm28, %zmm15
-; AVX512BW-NEXT:    vpermt2q %zmm28, %zmm0, %zmm10
-; AVX512BW-NEXT:    vmovdqa64 1792(%rdi), %zmm23
-; AVX512BW-NEXT:    vmovdqa64 %zmm14, %zmm11
-; AVX512BW-NEXT:    vpermi2q %zmm14, %zmm23, %zmm0
-; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm0 {%k1}
-; AVX512BW-NEXT:    vmovdqa 1728(%rdi), %ymm10
-; AVX512BW-NEXT:    vmovdqa64 1664(%rdi), %ymm28
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm28[0],ymm10[0],ymm28[2],ymm10[2]
-; AVX512BW-NEXT:    vmovdqa64 1600(%rdi), %ymm31
-; AVX512BW-NEXT:    vmovdqa 1536(%rdi), %ymm2
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm14 = ymm2[0],ymm31[0],ymm2[2],ymm31[2]
-; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm14[2,3],ymm1[2,3]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vmovdqa64 %zmm29, %zmm1
+; AVX512BW-NEXT:    vpermt2q %zmm16, %zmm0, %zmm1
+; AVX512BW-NEXT:    vmovdqa64 1792(%rdi), %zmm26
+; AVX512BW-NEXT:    vpermi2q %zmm12, %zmm26, %zmm0
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1}
+; AVX512BW-NEXT:    vmovdqa 1728(%rdi), %ymm2
+; AVX512BW-NEXT:    vmovdqa 1664(%rdi), %ymm8
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm11 = ymm8[0],ymm2[0],ymm8[2],ymm2[2]
+; AVX512BW-NEXT:    vmovdqa64 1600(%rdi), %ymm30
+; AVX512BW-NEXT:    vmovdqa 1536(%rdi), %ymm1
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm15 = ymm1[0],ymm30[0],ymm1[2],ymm30[2]
+; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm11 = ymm15[2,3],ymm11[2,3]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm11, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11]
 ; AVX512BW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm22, %zmm1
-; AVX512BW-NEXT:    vpermt2q %zmm17, %zmm0, %zmm1
-; AVX512BW-NEXT:    vmovdqa64 %zmm18, %zmm14
-; AVX512BW-NEXT:    vpermt2q %zmm21, %zmm0, %zmm14
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm14 {%k1}
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm13[1],ymm29[1],ymm13[3],ymm29[3]
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm4 = ymm4[1],ymm6[1],ymm4[3],ymm6[3]
-; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[2,3]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm14, %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm1
-; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm13
-; AVX512BW-NEXT:    vmovdqu64 %zmm3, (%rsp) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm24, %zmm0, %zmm1
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm4
-; AVX512BW-NEXT:    vpermt2q %zmm7, %zmm0, %zmm4
-; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm29
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm8[1],ymm9[3],ymm8[3]
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm16[1],ymm12[1],ymm16[3],ymm12[3]
-; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm20, %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm19, %zmm0, %zmm1
-; AVX512BW-NEXT:    vmovdqa64 %zmm30, %zmm3
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm14, %zmm0, %zmm3
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm3 {%k1}
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm25[1],ymm5[1],ymm25[3],ymm5[3]
-; AVX512BW-NEXT:    vmovdqa64 1216(%rdi), %zmm5
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm4 = ymm27[1],ymm26[1],ymm27[3],ymm26[3]
-; AVX512BW-NEXT:    vmovdqa64 1152(%rdi), %zmm8
-; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[2,3]
-; AVX512BW-NEXT:    vmovdqa64 1088(%rdi), %zmm9
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm3, %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm12, %zmm1
-; AVX512BW-NEXT:    vpermt2q %zmm15, %zmm0, %zmm1
-; AVX512BW-NEXT:    vmovdqa64 %zmm23, %zmm7
-; AVX512BW-NEXT:    vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermi2q %zmm11, %zmm23, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 1024(%rdi), %zmm3
+; AVX512BW-NEXT:    vmovdqa64 %zmm27, %zmm11
+; AVX512BW-NEXT:    vpermt2q %zmm5, %zmm0, %zmm11
+; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm24
+; AVX512BW-NEXT:    vpermt2q %zmm19, %zmm0, %zmm18
+; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm18 {%k1}
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm5 = ymm10[1],ymm22[1],ymm10[3],ymm22[3]
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],ymm9[1],ymm6[3],ymm9[3]
+; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm5 = ymm6[2,3],ymm5[2,3]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm5, %zmm18, %zmm5
+; AVX512BW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm5
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm10
+; AVX512BW-NEXT:    vpermt2q %zmm31, %zmm0, %zmm5
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm6
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm22, %zmm0, %zmm6
+; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm6 {%k1}
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm13[1],ymm7[1],ymm13[3],ymm7[3]
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm5 = ymm21[1],ymm17[1],ymm21[3],ymm17[3]
+; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm5[2,3],ymm3[2,3]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm3, %zmm6, %zmm3
 ; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm28[1],ymm10[1],ymm28[3],ymm10[3]
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],ymm31[1],ymm2[3],ymm31[3]
-; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
+; AVX512BW-NEXT:    vmovdqu64 %zmm23, (%rsp) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm23, %zmm3
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm18, %zmm0, %zmm3
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm21, %zmm5
+; AVX512BW-NEXT:    vpermt2q %zmm20, %zmm0, %zmm5
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm5 {%k1}
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm14[1],ymm4[1],ymm14[3],ymm4[3]
+; AVX512BW-NEXT:    vmovdqa64 1088(%rdi), %zmm7
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm4 = ymm28[1],ymm25[1],ymm28[3],ymm25[3]
+; AVX512BW-NEXT:    vmovdqa64 1024(%rdi), %zmm6
+; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3]
+; AVX512BW-NEXT:    vmovdqa64 1216(%rdi), %zmm13
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm3, %zmm5, %zmm3
+; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm29, %zmm15
+; AVX512BW-NEXT:    vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm29, %zmm3
+; AVX512BW-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm16, %zmm0, %zmm3
+; AVX512BW-NEXT:    vmovdqa64 %zmm26, %zmm9
+; AVX512BW-NEXT:    vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermi2q %zmm12, %zmm26, %zmm0
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm0 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 1152(%rdi), %zmm4
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm8[1],ymm2[1],ymm8[3],ymm2[3]
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],ymm30[1],ymm1[3],ymm30[3]
+; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3]
 ; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12]
 ; AVX512BW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm22, %zmm1
-; AVX512BW-NEXT:    vmovdqa64 %zmm22, %zmm25
-; AVX512BW-NEXT:    vpermt2q %zmm17, %zmm0, %zmm1
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm2 = zmm18[0],zmm21[0],zmm18[2],zmm21[2],zmm18[4],zmm21[4],zmm18[6],zmm21[6]
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
-; AVX512BW-NEXT:    vpermt2q %zmm9, %zmm0, %zmm3
-; AVX512BW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [4,12,4,12]
-; AVX512BW-NEXT:    # ymm1 = mem[0,1,0,1]
-; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm4
-; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm10
-; AVX512BW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm5, %zmm1, %zmm4
-; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm8
-; AVX512BW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm3, %zmm2, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm24, %zmm0, %zmm13
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm6[0],zmm29[0],zmm6[2],zmm29[2],zmm6[4],zmm29[4],zmm6[6],zmm29[6]
-; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm16
-; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm3 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 576(%rdi), %zmm4
-; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 512(%rdi), %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 704(%rdi), %zmm22
-; AVX512BW-NEXT:    vmovdqa64 640(%rdi), %zmm4
-; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm22, %zmm1, %zmm4
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm19, %zmm0, %zmm20
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm30[0],zmm14[0],zmm30[2],zmm14[2],zmm30[4],zmm14[4],zmm30[6],zmm14[6]
-; AVX512BW-NEXT:    vmovdqa64 %zmm14, %zmm6
-; AVX512BW-NEXT:    vmovdqa64 %zmm20, %zmm3 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %zmm4
-; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 192(%rdi), %zmm5
-; AVX512BW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 128(%rdi), %zmm4
+; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm1
 ; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm5, %zmm1, %zmm4
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm12, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 %zmm12, %zmm23
-; AVX512BW-NEXT:    vpermt2q %zmm15, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 %zmm15, %zmm28
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm7[0],zmm11[0],zmm7[2],zmm11[2],zmm7[4],zmm11[4],zmm7[6],zmm11[6]
+; AVX512BW-NEXT:    vpermt2q %zmm13, %zmm0, %zmm1
+; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm8
+; AVX512BW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm2
+; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm5
+; AVX512BW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm7, %zmm0, %zmm2
+; AVX512BW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqa64 %zmm27, %zmm2
+; AVX512BW-NEXT:    vpermt2q %zmm24, %zmm0, %zmm2
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm17[0],zmm19[0],zmm17[2],zmm19[2],zmm17[4],zmm19[4],zmm17[6],zmm19[6]
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 704(%rdi), %zmm6
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm3, %zmm1
+; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 640(%rdi), %zmm1
+; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm6, %zmm0, %zmm1
+; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm25
+; AVX512BW-NEXT:    vmovdqa64 576(%rdi), %zmm6
+; AVX512BW-NEXT:    vmovdqa64 512(%rdi), %zmm14
+; AVX512BW-NEXT:    vmovdqa64 %zmm14, %zmm2
+; AVX512BW-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm6, %zmm0, %zmm2
+; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm26
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm2
+; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm29
+; AVX512BW-NEXT:    vpermt2q %zmm31, %zmm0, %zmm2
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm11[0],zmm22[0],zmm11[2],zmm22[2],zmm11[4],zmm22[4],zmm11[6],zmm22[6]
+; AVX512BW-NEXT:    vmovdqa64 %zmm22, %zmm10
 ; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 1600(%rdi), %zmm2
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm3, %zmm1
+; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 192(%rdi), %zmm2
 ; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 1536(%rdi), %zmm5
-; AVX512BW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermi2q %zmm2, %zmm5, %zmm0
-; AVX512BW-NEXT:    vmovdqa64 1728(%rdi), %zmm2
+; AVX512BW-NEXT:    vmovdqa64 128(%rdi), %zmm1
+; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm0, %zmm1
+; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %zmm6
+; AVX512BW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm2
 ; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 1664(%rdi), %zmm5
-; AVX512BW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermi2q %zmm2, %zmm5, %zmm1
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm3, %zmm0
+; AVX512BW-NEXT:    vpermt2q %zmm6, %zmm0, %zmm2
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-NEXT:    vpermt2q %zmm18, %zmm0, %zmm23
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm21[0],zmm20[0],zmm21[2],zmm20[2],zmm21[4],zmm20[4],zmm21[6],zmm20[6]
+; AVX512BW-NEXT:    vmovdqa64 %zmm21, %zmm22
+; AVX512BW-NEXT:    vmovdqa64 %zmm23, %zmm3 {%k1}
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm3, %zmm1
+; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 1728(%rdi), %zmm18
+; AVX512BW-NEXT:    vmovdqa64 1664(%rdi), %zmm1
+; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm18, %zmm0, %zmm1
+; AVX512BW-NEXT:    vmovdqa64 1600(%rdi), %zmm13
+; AVX512BW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 1536(%rdi), %zmm6
+; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm2
+; AVX512BW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm13, %zmm0, %zmm2
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-NEXT:    vpermi2q %zmm16, %zmm15, %zmm0
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm2 = zmm9[0],zmm12[0],zmm9[2],zmm12[2],zmm9[4],zmm12[4],zmm9[6],zmm12[6]
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k1}
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm2, %zmm0
 ; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm31 = [0,8,0,8,0,8,0,8]
-; AVX512BW-NEXT:    # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm18, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm21, %zmm31, %zmm0
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [5,13,5,13,5,13,5,13]
+; AVX512BW-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vpermt2q %zmm8, %zmm2, %zmm4
+; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm1
+; AVX512BW-NEXT:    vpermt2q %zmm7, %zmm2, %zmm1
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm4[4,5,6,7]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm21 = [0,8,0,8,0,8,0,8]
+; AVX512BW-NEXT:    # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vmovdqa64 %zmm17, %zmm9
+; AVX512BW-NEXT:    vmovdqa64 %zmm17, %zmm0
+; AVX512BW-NEXT:    vpermt2q %zmm19, %zmm21, %zmm0
 ; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm24 = [1,9,1,9,1,9,1,9]
-; AVX512BW-NEXT:    # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm18, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm21, %zmm24, %zmm0
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm16 = [1,9,1,9,1,9,1,9]
+; AVX512BW-NEXT:    # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vmovdqa64 %zmm17, %zmm0
+; AVX512BW-NEXT:    vpermt2q %zmm19, %zmm16, %zmm0
+; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm23 = [6,14,6,14,6,14,6,14]
+; AVX512BW-NEXT:    # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vmovdqa64 %zmm17, %zmm0
+; AVX512BW-NEXT:    vpermt2q %zmm19, %zmm23, %zmm0
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm7 = zmm17[1],zmm19[1],zmm17[3],zmm19[3],zmm17[5],zmm19[5],zmm17[7],zmm19[7]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm17 = [7,15,7,15,7,15,7,15]
+; AVX512BW-NEXT:    # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vpermt2q %zmm19, %zmm17, %zmm9
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm27[0],zmm24[0],zmm27[2],zmm24[2],zmm27[4],zmm24[4],zmm27[6],zmm24[6]
+; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm9 {%k1} = zmm27[1],zmm24[1],zmm27[3],zmm24[3],zmm27[5],zmm24[5],zmm27[7],zmm24[7]
+; AVX512BW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm27, %zmm0
+; AVX512BW-NEXT:    vmovdqa64 %zmm27, %zmm8
+; AVX512BW-NEXT:    vpermt2q %zmm24, %zmm21, %zmm27
+; AVX512BW-NEXT:    vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm24, %zmm16, %zmm0
+; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm24, %zmm2, %zmm8
+; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm7 {%k1}
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm4, %zmm7, %zmm0
 ; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm4 = [6,14,6,14,6,14,6,14]
-; AVX512BW-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm18, %zmm3
-; AVX512BW-NEXT:    vpermt2q %zmm21, %zmm4, %zmm3
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm2 = zmm18[1],zmm21[1],zmm18[3],zmm21[3],zmm18[5],zmm21[5],zmm18[7],zmm21[7]
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm11 = [7,15,7,15,7,15,7,15]
-; AVX512BW-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vpermt2q %zmm21, %zmm11, %zmm18
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm25[0],zmm17[0],zmm25[2],zmm17[2],zmm25[4],zmm17[4],zmm25[6],zmm17[6]
-; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm18 {%k1} = zmm25[1],zmm17[1],zmm25[3],zmm17[3],zmm25[5],zmm17[5],zmm25[7],zmm17[7]
-; AVX512BW-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm25, %zmm3
-; AVX512BW-NEXT:    vmovdqa64 %zmm25, %zmm1
-; AVX512BW-NEXT:    vpermt2q %zmm17, %zmm31, %zmm25
-; AVX512BW-NEXT:    vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm17, %zmm24, %zmm3
-; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm5 = [5,13,5,13,5,13,5,13]
-; AVX512BW-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vpermt2q %zmm17, %zmm5, %zmm1
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm9, %zmm5, %zmm3
-; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm9 = [5,13,5,13]
-; AVX512BW-NEXT:    # ymm9 = mem[0,1,0,1]
-; AVX512BW-NEXT:    vpermt2q %zmm8, %zmm9, %zmm10
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm3, %zmm2, %zmm0
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm7
+; AVX512BW-NEXT:    vmovdqa64 %zmm25, %zmm9
+; AVX512BW-NEXT:    vpermt2q %zmm25, %zmm2, %zmm7
+; AVX512BW-NEXT:    vmovdqa64 %zmm26, %zmm13
+; AVX512BW-NEXT:    vpermt2q %zmm26, %zmm2, %zmm14
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3],ymm7[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm0
+; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm4
+; AVX512BW-NEXT:    vpermt2q %zmm10, %zmm21, %zmm4
+; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm4
+; AVX512BW-NEXT:    vpermt2q %zmm10, %zmm16, %zmm4
+; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm30
+; AVX512BW-NEXT:    vpermt2q %zmm10, %zmm23, %zmm30
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm11 = zmm11[1],zmm10[1],zmm11[3],zmm10[3],zmm11[5],zmm10[5],zmm11[7],zmm10[7]
+; AVX512BW-NEXT:    vpermt2q %zmm10, %zmm17, %zmm0
+; AVX512BW-NEXT:    vmovdqa64 %zmm29, %zmm4
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm30 {%k1} = zmm29[0],zmm31[0],zmm29[2],zmm31[2],zmm29[4],zmm31[4],zmm29[6],zmm31[6]
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm29[1],zmm31[1],zmm29[3],zmm31[3],zmm29[5],zmm31[5],zmm29[7],zmm31[7]
 ; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm16, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 %zmm16, %zmm18
-; AVX512BW-NEXT:    vpermt2q %zmm29, %zmm31, %zmm18
-; AVX512BW-NEXT:    vmovdqa64 %zmm16, %zmm20
-; AVX512BW-NEXT:    vpermt2q %zmm29, %zmm24, %zmm20
-; AVX512BW-NEXT:    vpermt2q %zmm29, %zmm4, %zmm16
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm15 = zmm2[1],zmm29[1],zmm2[3],zmm29[3],zmm2[5],zmm29[5],zmm2[7],zmm29[7]
-; AVX512BW-NEXT:    vpermt2q %zmm29, %zmm11, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqu64 (%rsp), %zmm26 # 64-byte Reload
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm16 {%k1} = zmm26[0],zmm0[0],zmm26[2],zmm0[2],zmm26[4],zmm0[4],zmm26[6],zmm0[6]
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm26[1],zmm0[1],zmm26[3],zmm0[3],zmm26[5],zmm0[5],zmm26[7],zmm0[7]
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm26, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 %zmm26, %zmm21
-; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm31, %zmm26
-; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm24, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, (%rsp) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm5, %zmm21
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm14, %zmm3
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm5, %zmm3
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm22, %zmm29
-; AVX512BW-NEXT:    vpermt2q %zmm22, %zmm9, %zmm10
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm10[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqa64 %ymm1, %ymm27
-; AVX512BW-NEXT:    vmovdqa64 %zmm30, %zmm12
-; AVX512BW-NEXT:    vpermt2q %zmm6, %zmm31, %zmm12
-; AVX512BW-NEXT:    vmovdqa64 %zmm30, %zmm17
-; AVX512BW-NEXT:    vpermt2q %zmm6, %zmm24, %zmm17
-; AVX512BW-NEXT:    vmovdqa64 %zmm30, %zmm19
-; AVX512BW-NEXT:    vpermt2q %zmm6, %zmm4, %zmm19
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm13 = zmm30[1],zmm6[1],zmm30[3],zmm6[3],zmm30[5],zmm6[5],zmm30[7],zmm6[7]
-; AVX512BW-NEXT:    vpermt2q %zmm6, %zmm11, %zmm30
+; AVX512BW-NEXT:    vmovdqa64 %zmm29, %zmm0
+; AVX512BW-NEXT:    vmovdqa64 %zmm29, %zmm14
+; AVX512BW-NEXT:    vpermt2q %zmm31, %zmm21, %zmm4
+; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm31, %zmm16, %zmm0
+; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm31, %zmm2, %zmm14
+; AVX512BW-NEXT:    vmovdqa64 %zmm14, %zmm11 {%k1}
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm8, %zmm11, %zmm0
+; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm28, %zmm2, %zmm8
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm11
+; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm11 # 64-byte Folded Reload
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm8[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqa64 %zmm22, %zmm0
+; AVX512BW-NEXT:    vmovdqa64 %zmm22, %zmm25
+; AVX512BW-NEXT:    vpermt2q %zmm20, %zmm21, %zmm25
+; AVX512BW-NEXT:    vmovdqa64 %zmm22, %zmm5
+; AVX512BW-NEXT:    vpermt2q %zmm20, %zmm16, %zmm5
+; AVX512BW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm22, %zmm27
+; AVX512BW-NEXT:    vpermt2q %zmm20, %zmm23, %zmm27
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm14 = zmm22[1],zmm20[1],zmm22[3],zmm20[3],zmm22[5],zmm20[5],zmm22[7],zmm20[7]
+; AVX512BW-NEXT:    vpermt2q %zmm20, %zmm17, %zmm0
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqu64 (%rsp), %zmm31 # 64-byte Reload
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm27 {%k1} = zmm31[0],zmm1[0],zmm31[2],zmm1[2],zmm31[4],zmm1[4],zmm31[6],zmm1[6]
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm31[1],zmm1[1],zmm31[3],zmm1[3],zmm31[5],zmm1[5],zmm31[7],zmm1[7]
+; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm31, %zmm0
+; AVX512BW-NEXT:    vmovdqa64 %zmm31, %zmm24
+; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm21, %zmm31
+; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm16, %zmm0
+; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm2, %zmm24
+; AVX512BW-NEXT:    vmovdqa64 %zmm24, %zmm14 {%k1}
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm11, %zmm14, %zmm0
+; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm15, %zmm11
+; AVX512BW-NEXT:    vpermt2q %zmm18, %zmm2, %zmm11
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm19, %zmm2, %zmm6
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm11 = ymm6[0,1,2,3],ymm11[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-NEXT:    vpermi2q %zmm20, %zmm5, %zmm2
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm19 {%k1} = zmm22[0],zmm1[0],zmm22[2],zmm1[2],zmm22[4],zmm1[4],zmm22[6],zmm1[6]
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm30 {%k1} = zmm22[1],zmm1[1],zmm22[3],zmm1[3],zmm22[5],zmm1[5],zmm22[7],zmm1[7]
-; AVX512BW-NEXT:    vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm22, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 %zmm22, %zmm30
-; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm31, %zmm22
-; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm24, %zmm2
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm14 = zmm22[1],zmm12[1],zmm22[3],zmm12[3],zmm22[5],zmm12[5],zmm22[7],zmm12[7]
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm14 {%k1}
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm11, %zmm14, %zmm2
+; AVX512BW-NEXT:    vmovdqu64 %zmm2, (%rsp) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm2
+; AVX512BW-NEXT:    vpermt2q %zmm20, %zmm21, %zmm2
 ; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm5, %zmm30
-; AVX512BW-NEXT:    vmovdqa64 %zmm23, %zmm7
-; AVX512BW-NEXT:    vmovdqa64 %zmm23, %zmm25
-; AVX512BW-NEXT:    vmovdqa64 %zmm28, %zmm1
-; AVX512BW-NEXT:    vpermt2q %zmm28, %zmm31, %zmm25
+; AVX512BW-NEXT:    vpermi2q %zmm12, %zmm22, %zmm21
+; AVX512BW-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm2
+; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm21
+; AVX512BW-NEXT:    vpermt2q %zmm20, %zmm16, %zmm2
+; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermi2q %zmm12, %zmm22, %zmm16
+; AVX512BW-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm12, %zmm16
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm14
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm24, %zmm23, %zmm14
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm26, %zmm23, %zmm11
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm12
+; AVX512BW-NEXT:    vpermt2q %zmm9, %zmm23, %zmm12
+; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm29
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-NEXT:    vpermi2q %zmm2, %zmm3, %zmm31
-; AVX512BW-NEXT:    vmovdqa64 %zmm23, %zmm28
-; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm24, %zmm28
-; AVX512BW-NEXT:    vpermi2q %zmm2, %zmm3, %zmm24
-; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm23
-; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm4, %zmm23
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm8 = zmm3[1],zmm2[1],zmm3[3],zmm2[3],zmm3[5],zmm2[5],zmm3[7],zmm2[7]
-; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm11, %zmm3
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm23 {%k1} = zmm7[0],zmm1[0],zmm7[2],zmm1[2],zmm7[4],zmm1[4],zmm7[6],zmm1[6]
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm7[1],zmm1[1],zmm7[3],zmm1[3],zmm7[5],zmm1[5],zmm7[7],zmm1[7]
-; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm3
-; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm5, %zmm3
-; AVX512BW-NEXT:    vmovdqa64 %zmm14, %zmm10
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm8
+; AVX512BW-NEXT:    vpermt2q %zmm13, %zmm23, %zmm8
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm10
+; AVX512BW-NEXT:    vpermt2q %zmm28, %zmm23, %zmm10
+; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm5
 ; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm7
-; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm4, %zmm10
-; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm11, %zmm14
-; AVX512BW-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm5, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm4
-; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm7, %zmm4
-; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm6
-; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm11, %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512BW-NEXT:    vpermi2q %zmm14, %zmm0, %zmm5
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm7
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm6, %zmm7
-; AVX512BW-NEXT:    vpermi2q %zmm14, %zmm0, %zmm6
-; AVX512BW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm14, %zmm11, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm11, %zmm4
-; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm21, %zmm15 {%k1}
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm27, %zmm15, %zmm0
+; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm23, %zmm7
+; AVX512BW-NEXT:    vmovdqa64 %zmm15, %zmm0
+; AVX512BW-NEXT:    vpermt2q %zmm18, %zmm23, %zmm0
 ; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm30, %zmm13 {%k1}
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm6
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm14, %zmm9, %zmm6
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm6[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm13, %zmm0
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm0
+; AVX512BW-NEXT:    vpermt2q %zmm19, %zmm23, %zmm0
 ; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm8 {%k1}
+; AVX512BW-NEXT:    vpermi2q %zmm16, %zmm22, %zmm23
+; AVX512BW-NEXT:    vpermt2q %zmm16, %zmm17, %zmm22
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm23 {%k1} = zmm21[0],zmm20[0],zmm21[2],zmm20[2],zmm21[4],zmm20[4],zmm21[6],zmm20[6]
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm22 {%k1} = zmm21[1],zmm20[1],zmm21[3],zmm20[3],zmm21[5],zmm20[5],zmm21[7],zmm20[7]
+; AVX512BW-NEXT:    vpermt2q %zmm29, %zmm17, %zmm3
+; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm13, %zmm17, %zmm2
+; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm28, %zmm17, %zmm9
+; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm17, %zmm5
+; AVX512BW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm18, %zmm17, %zmm15
+; AVX512BW-NEXT:    vpermt2q %zmm19, %zmm17, %zmm1
+; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm24, %zmm17, %zmm6
+; AVX512BW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm26, %zmm17, %zmm3
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm11[0,1,2,3],ymm14[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm4, %zmm0, %zmm14
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm12[4,5,6,7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm30, %zmm4
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm10[4,5,6,7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm27, %zmm10
+; AVX512BW-NEXT:    vmovdqa64 %zmm31, %zmm25 {%k1}
+; AVX512BW-NEXT:    vmovdqa 64(%rdi), %xmm0
+; AVX512BW-NEXT:    vinserti128 $1, 192(%rdi), %ymm0, %ymm0
+; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm1
+; AVX512BW-NEXT:    vinserti128 $1, 128(%rdi), %ymm1, %ymm1
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm25, %zmm2
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm7 {%k1}
+; AVX512BW-NEXT:    vmovdqa 576(%rdi), %xmm5
+; AVX512BW-NEXT:    vinserti128 $1, 704(%rdi), %ymm5, %ymm5
+; AVX512BW-NEXT:    vmovdqa 512(%rdi), %xmm6
+; AVX512BW-NEXT:    vinserti128 $1, 640(%rdi), %ymm6, %ymm6
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm18 = ymm6[0],ymm5[0],ymm6[2],ymm5[2]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm18, %zmm7, %zmm27
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm7 {%k1}
+; AVX512BW-NEXT:    vmovdqa 1088(%rdi), %xmm13
+; AVX512BW-NEXT:    vinserti128 $1, 1216(%rdi), %ymm13, %ymm13
+; AVX512BW-NEXT:    vmovdqa64 1024(%rdi), %xmm18
+; AVX512BW-NEXT:    vinserti32x4 $1, 1152(%rdi), %ymm18, %ymm18
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm19 = ymm18[0],ymm13[0],ymm18[2],ymm13[2]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm19, %zmm7, %zmm19
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm7 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 1600(%rdi), %xmm21
+; AVX512BW-NEXT:    vinserti32x4 $1, 1728(%rdi), %ymm21, %ymm21
+; AVX512BW-NEXT:    vmovdqa64 1536(%rdi), %xmm25
+; AVX512BW-NEXT:    vinserti32x4 $1, 1664(%rdi), %ymm25, %ymm25
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm26 = ymm25[0],ymm21[0],ymm25[2],ymm21[2]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm26, %zmm7, %zmm12
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm7 {%k1}
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm13 = ymm18[1],ymm13[1],ymm18[3],ymm13[3]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm13, %zmm7, %zmm13
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm7 {%k1}
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm5 = ymm6[1],ymm5[1],ymm6[3],ymm5[3]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm5, %zmm7, %zmm5
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm7 {%k1}
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm7, %zmm0
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
-; AVX512BW-NEXT:    vpermi2q %zmm15, %zmm6, %zmm9
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm9[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm8, %zmm30
-; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [6,14,6,14]
-; AVX512BW-NEXT:    # ymm2 = mem[0,1,0,1]
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm5
-; AVX512BW-NEXT:    vpermt2q %zmm29, %zmm2, %zmm5
-; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [7,15,7,15]
-; AVX512BW-NEXT:    # ymm0 = mem[0,1,0,1]
-; AVX512BW-NEXT:    vpermt2q %zmm29, %zmm0, %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm9
-; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm1
-; AVX512BW-NEXT:    vpermt2q %zmm14, %zmm2, %zmm1
-; AVX512BW-NEXT:    vpermt2q %zmm14, %zmm0, %zmm9
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm8
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm2, %zmm8
-; AVX512BW-NEXT:    vpermi2q %zmm15, %zmm6, %zmm2
-; AVX512BW-NEXT:    vpermt2q %zmm15, %zmm0, %zmm6
-; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm3
-; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm0, %zmm11
-; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm6
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm8[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm21
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm10[0,1,2,3],ymm5[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm4, %zmm16, %zmm0
-; AVX512BW-NEXT:    vmovdqa64 %zmm22, %zmm12 {%k1}
-; AVX512BW-NEXT:    vmovdqa 64(%rdi), %xmm4
-; AVX512BW-NEXT:    vinserti128 $1, 192(%rdi), %ymm4, %ymm4
-; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm5
-; AVX512BW-NEXT:    vinserti128 $1, 128(%rdi), %ymm5, %ymm5
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm7 = ymm5[0],ymm4[0],ymm5[2],ymm4[2]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm7, %zmm12, %zmm7
-; AVX512BW-NEXT:    vmovdqa64 %zmm26, %zmm18 {%k1}
-; AVX512BW-NEXT:    vmovdqa 576(%rdi), %xmm8
-; AVX512BW-NEXT:    vinserti128 $1, 704(%rdi), %ymm8, %ymm8
-; AVX512BW-NEXT:    vmovdqa 512(%rdi), %xmm10
-; AVX512BW-NEXT:    vinserti128 $1, 640(%rdi), %ymm10, %ymm10
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm11 = ymm10[0],ymm8[0],ymm10[2],ymm8[2]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm11, %zmm18, %zmm11
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm12, %zmm13 {%k1}
-; AVX512BW-NEXT:    vmovdqa 1088(%rdi), %xmm12
-; AVX512BW-NEXT:    vinserti128 $1, 1216(%rdi), %ymm12, %ymm12
-; AVX512BW-NEXT:    vmovdqa64 1024(%rdi), %xmm16
-; AVX512BW-NEXT:    vinserti32x4 $1, 1152(%rdi), %ymm16, %ymm16
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm22 = ymm16[0],ymm12[0],ymm16[2],ymm12[2]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm22, %zmm13, %zmm22
-; AVX512BW-NEXT:    vmovdqa64 %zmm25, %zmm31 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 1600(%rdi), %xmm25
-; AVX512BW-NEXT:    vinserti32x4 $1, 1728(%rdi), %ymm25, %ymm25
-; AVX512BW-NEXT:    vmovdqa64 1536(%rdi), %xmm26
-; AVX512BW-NEXT:    vinserti32x4 $1, 1664(%rdi), %ymm26, %ymm26
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm27 = ymm26[0],ymm25[0],ymm26[2],ymm25[2]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm27, %zmm31, %zmm27
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm14, %zmm13 {%k1}
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm12 = ymm16[1],ymm12[1],ymm16[3],ymm12[3]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm12, %zmm13, %zmm12
-; AVX512BW-NEXT:    vmovdqu64 (%rsp), %zmm13 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm20 {%k1}
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm8 = ymm10[1],ymm8[1],ymm10[3],ymm8[3]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm8, %zmm20, %zmm8
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm17 {%k1}
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm4 = ymm5[1],ymm4[1],ymm5[3],ymm4[3]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm4, %zmm17, %zmm4
-; AVX512BW-NEXT:    vmovdqa64 %zmm28, %zmm24 {%k1}
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm5 = ymm26[1],ymm25[1],ymm26[3],ymm25[3]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm5, %zmm24, %zmm5
-; AVX512BW-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX512BW-NEXT:    # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm19, %zmm1
-; AVX512BW-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
-; AVX512BW-NEXT:    # ymm2 = mem[0,1,2,3],ymm2[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm23, %zmm2
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512BW-NEXT:    vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload
-; AVX512BW-NEXT:    # ymm10 = mem[0,1,2,3],ymm10[4,5,6,7]
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512BW-NEXT:    vinsertf64x4 $0, %ymm10, %zmm13, %zmm10
-; AVX512BW-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm13 # 32-byte Folded Reload
-; AVX512BW-NEXT:    # ymm13 = mem[0,1,2,3],ymm9[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm6 {%k1}
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm25[1],ymm21[1],ymm25[3],ymm21[3]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm6, %zmm1
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512BW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
+; AVX512BW-NEXT:    # ymm6 = ymm6[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm6, %zmm23, %zmm6
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512BW-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
+; AVX512BW-NEXT:    # ymm7 = ymm7[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-NEXT:    vinsertf64x4 $0, %ymm7, %zmm8, %zmm7
+; AVX512BW-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm8 # 32-byte Folded Reload
+; AVX512BW-NEXT:    # ymm8 = mem[0,1,2,3],ymm9[4,5,6,7]
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm13, %zmm9, %zmm13
-; AVX512BW-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm14 # 32-byte Folded Reload
-; AVX512BW-NEXT:    # ymm14 = mem[0,1,2,3],ymm3[4,5,6,7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm8, %zmm9, %zmm8
+; AVX512BW-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm9 # 32-byte Folded Reload
+; AVX512BW-NEXT:    # ymm9 = mem[0,1,2,3],ymm15[4,5,6,7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm9, %zmm22, %zmm9
+; AVX512BW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm11 # 32-byte Folded Reload
+; AVX512BW-NEXT:    # ymm11 = ymm3[0,1,2,3],mem[4,5,6,7]
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm14, %zmm3, %zmm14
-; AVX512BW-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm15 # 32-byte Folded Reload
-; AVX512BW-NEXT:    # ymm15 = mem[0,1,2,3],ymm6[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm15, %zmm3, %zmm15
-; AVX512BW-NEXT:    vmovdqa64 %zmm27, 192(%rsi)
-; AVX512BW-NEXT:    vmovdqa64 %zmm22, 128(%rsi)
-; AVX512BW-NEXT:    vmovdqa64 %zmm11, 64(%rsi)
-; AVX512BW-NEXT:    vmovdqa64 %zmm7, (%rsi)
-; AVX512BW-NEXT:    vmovdqa64 %zmm5, 192(%rdx)
-; AVX512BW-NEXT:    vmovdqa64 %zmm4, (%rdx)
-; AVX512BW-NEXT:    vmovdqa64 %zmm8, 64(%rdx)
-; AVX512BW-NEXT:    vmovdqa64 %zmm12, 128(%rdx)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm4, 192(%rcx)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm4, (%rcx)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm4, 64(%rcx)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm4, 128(%rcx)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm4, 192(%r8)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm4, (%r8)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm4, 64(%r8)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm4, 128(%r8)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm3, 192(%r9)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm4, (%r9)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm4, 64(%r9)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm4, 128(%r9)
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm11, %zmm3, %zmm11
+; AVX512BW-NEXT:    vmovdqa64 %zmm12, 192(%rsi)
+; AVX512BW-NEXT:    vmovdqa64 %zmm19, 128(%rsi)
+; AVX512BW-NEXT:    vmovdqa64 %zmm27, 64(%rsi)
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, (%rsi)
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, 192(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm5, 64(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm13, 128(%rdx)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm0, 192(%rcx)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm0, (%rcx)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm0, 64(%rcx)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm0, 128(%rcx)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm0, 192(%r8)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm0, (%r8)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm0, 64(%r8)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm0, 128(%r8)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm0, 192(%r9)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm0, (%r9)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm0, 64(%r9)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm0, 128(%r9)
 ; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-NEXT:    vmovdqa64 %zmm30, 192(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm3, (%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm3, 64(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm3, 128(%rax)
+; AVX512BW-NEXT:    vmovups (%rsp), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm0, 192(%rax)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm0, (%rax)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm0, 64(%rax)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm0, 128(%rax)
 ; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, 192(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, (%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, 64(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm21, 128(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm6, 192(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm10, (%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm4, 64(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm14, 128(%rax)
 ; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-NEXT:    vmovdqa64 %zmm15, 128(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm14, 192(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm13, (%rax)
-; AVX512BW-NEXT:    vmovaps %zmm10, 64(%rax)
-; AVX512BW-NEXT:    addq $2632, %rsp # imm = 0xA48
+; AVX512BW-NEXT:    vmovdqa64 %zmm11, 128(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm9, 192(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm8, (%rax)
+; AVX512BW-NEXT:    vmovaps %zmm7, 64(%rax)
+; AVX512BW-NEXT:    addq $3208, %rsp # imm = 0xC88
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
   %wide.vec = load <256 x i64>, ptr %in.vec, align 64
@@ -9123,42 +9133,41 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ;
 ; AVX512F-LABEL: load_i64_stride8_vf64:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    subq $6600, %rsp # imm = 0x19C8
-; AVX512F-NEXT:    vmovdqa64 3392(%rdi), %zmm13
-; AVX512F-NEXT:    vmovdqa64 3328(%rdi), %zmm7
-; AVX512F-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 3520(%rdi), %zmm1
-; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 3456(%rdi), %zmm10
-; AVX512F-NEXT:    vmovdqa64 1856(%rdi), %zmm17
-; AVX512F-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 1984(%rdi), %zmm12
-; AVX512F-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 832(%rdi), %zmm8
-; AVX512F-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 768(%rdi), %zmm4
-; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 960(%rdi), %zmm9
-; AVX512F-NEXT:    vmovdqa64 896(%rdi), %zmm11
+; AVX512F-NEXT:    subq $6664, %rsp # imm = 0x1A08
+; AVX512F-NEXT:    vmovdqa64 3392(%rdi), %zmm3
+; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 3328(%rdi), %zmm16
+; AVX512F-NEXT:    vmovdqa64 3520(%rdi), %zmm8
+; AVX512F-NEXT:    vmovdqa64 3456(%rdi), %zmm28
+; AVX512F-NEXT:    vmovdqa64 1856(%rdi), %zmm10
+; AVX512F-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 1984(%rdi), %zmm11
 ; AVX512F-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 320(%rdi), %zmm15
-; AVX512F-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 256(%rdi), %zmm5
+; AVX512F-NEXT:    vmovdqa64 832(%rdi), %zmm4
+; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 768(%rdi), %zmm5
 ; AVX512F-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 448(%rdi), %zmm16
-; AVX512F-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 384(%rdi), %zmm6
+; AVX512F-NEXT:    vmovdqa64 960(%rdi), %zmm6
+; AVX512F-NEXT:    vmovdqa64 896(%rdi), %zmm9
+; AVX512F-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 320(%rdi), %zmm12
+; AVX512F-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 256(%rdi), %zmm15
+; AVX512F-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 448(%rdi), %zmm13
+; AVX512F-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 384(%rdi), %zmm14
+; AVX512F-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-NEXT:    movb $-64, %al
 ; AVX512F-NEXT:    kmovw %eax, %k1
 ; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10]
 ; AVX512F-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm10, %zmm0
-; AVX512F-NEXT:    vmovdqa64 %zmm10, %zmm29
-; AVX512F-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
-; AVX512F-NEXT:    vmovdqa64 %zmm7, %zmm1
-; AVX512F-NEXT:    vpermt2q %zmm13, %zmm2, %zmm1
-; AVX512F-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm28, %zmm0
+; AVX512F-NEXT:    vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm8, %zmm2, %zmm0
+; AVX512F-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm16, %zmm1
+; AVX512F-NEXT:    vpermt2q %zmm3, %zmm2, %zmm1
 ; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
 ; AVX512F-NEXT:    vmovdqa 3264(%rdi), %ymm3
 ; AVX512F-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -9167,62 +9176,60 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
 ; AVX512F-NEXT:    vmovdqa 3136(%rdi), %ymm3
 ; AVX512F-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-NEXT:    vmovdqa 3072(%rdi), %ymm14
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm14[0],ymm3[0],ymm14[2],ymm3[2]
+; AVX512F-NEXT:    vmovdqa 3072(%rdi), %ymm7
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm7[0],ymm3[0],ymm7[2],ymm3[2]
 ; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
 ; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
 ; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm11, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm9, %zmm2, %zmm0
-; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm7
-; AVX512F-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm1
-; AVX512F-NEXT:    vpermt2q %zmm8, %zmm2, %zmm1
+; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm0
+; AVX512F-NEXT:    vpermt2q %zmm6, %zmm2, %zmm0
+; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm9
+; AVX512F-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm1
+; AVX512F-NEXT:    vpermt2q %zmm4, %zmm2, %zmm1
 ; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512F-NEXT:    vmovdqa 704(%rdi), %ymm3
-; AVX512F-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-NEXT:    vmovdqa 640(%rdi), %ymm0
+; AVX512F-NEXT:    vmovdqa 704(%rdi), %ymm0
 ; AVX512F-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
-; AVX512F-NEXT:    vmovdqa64 576(%rdi), %ymm26
-; AVX512F-NEXT:    vmovdqa64 512(%rdi), %ymm23
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm23[0],ymm26[0],ymm23[2],ymm26[2]
+; AVX512F-NEXT:    vmovdqa64 640(%rdi), %ymm20
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm20[0],ymm0[0],ymm20[2],ymm0[2]
+; AVX512F-NEXT:    vmovdqa64 576(%rdi), %ymm22
+; AVX512F-NEXT:    vmovdqa64 512(%rdi), %ymm19
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm19[0],ymm22[0],ymm19[2],ymm22[2]
 ; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
 ; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
 ; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm0
-; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm8
-; AVX512F-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm16, %zmm2, %zmm0
-; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm1
-; AVX512F-NEXT:    vpermt2q %zmm15, %zmm2, %zmm1
+; AVX512F-NEXT:    vmovdqa64 %zmm14, %zmm0
+; AVX512F-NEXT:    vpermt2q %zmm13, %zmm2, %zmm0
+; AVX512F-NEXT:    vmovdqa64 %zmm15, %zmm1
+; AVX512F-NEXT:    vpermt2q %zmm12, %zmm2, %zmm1
 ; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512F-NEXT:    vmovdqa 192(%rdi), %ymm0
-; AVX512F-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-NEXT:    vmovdqa64 128(%rdi), %ymm30
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm30[0],ymm0[0],ymm30[2],ymm0[2]
-; AVX512F-NEXT:    vmovdqa64 64(%rdi), %ymm20
-; AVX512F-NEXT:    vmovdqa64 (%rdi), %ymm16
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm16[0],ymm20[0],ymm16[2],ymm20[2]
+; AVX512F-NEXT:    vmovdqa 192(%rdi), %ymm14
+; AVX512F-NEXT:    vmovdqa 128(%rdi), %ymm15
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm15[0],ymm14[0],ymm15[2],ymm14[2]
+; AVX512F-NEXT:    vmovdqa64 64(%rdi), %ymm21
+; AVX512F-NEXT:    vmovdqa (%rdi), %ymm13
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm13[0],ymm21[0],ymm13[2],ymm21[2]
 ; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
 ; AVX512F-NEXT:    vmovdqa64 1920(%rdi), %zmm3
 ; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
 ; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm12, %zmm2, %zmm0
+; AVX512F-NEXT:    vpermt2q %zmm11, %zmm2, %zmm0
 ; AVX512F-NEXT:    vmovdqa64 1792(%rdi), %zmm1
 ; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm17, %zmm2, %zmm1
+; AVX512F-NEXT:    vpermt2q %zmm10, %zmm2, %zmm1
 ; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
 ; AVX512F-NEXT:    vmovdqa 1728(%rdi), %ymm3
 ; AVX512F-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512F-NEXT:    vmovdqa 1664(%rdi), %ymm0
 ; AVX512F-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
-; AVX512F-NEXT:    vmovdqa64 1600(%rdi), %ymm21
-; AVX512F-NEXT:    vmovdqa64 1536(%rdi), %ymm17
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm17[0],ymm21[0],ymm17[2],ymm21[2]
+; AVX512F-NEXT:    vmovdqa 1600(%rdi), %ymm4
+; AVX512F-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-NEXT:    vmovdqa 1536(%rdi), %ymm3
+; AVX512F-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2]
 ; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
 ; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
 ; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -9233,61 +9240,64 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512F-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
 ; AVX512F-NEXT:    vmovdqa64 1344(%rdi), %zmm3
 ; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 1280(%rdi), %zmm11
-; AVX512F-NEXT:    vmovdqa64 %zmm11, %zmm1
-; AVX512F-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 1280(%rdi), %zmm1
+; AVX512F-NEXT:    vmovdqu64 %zmm1, (%rsp) # 64-byte Spill
 ; AVX512F-NEXT:    vpermt2q %zmm3, %zmm2, %zmm1
 ; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512F-NEXT:    vmovdqa 1216(%rdi), %ymm0
+; AVX512F-NEXT:    vmovdqa 1216(%rdi), %ymm3
+; AVX512F-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-NEXT:    vmovdqa 1152(%rdi), %ymm0
 ; AVX512F-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-NEXT:    vmovdqa64 1152(%rdi), %ymm25
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm25[0],ymm0[0],ymm25[2],ymm0[2]
-; AVX512F-NEXT:    vmovdqa64 1088(%rdi), %ymm24
-; AVX512F-NEXT:    vmovdqa64 1024(%rdi), %ymm22
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm22[0],ymm24[0],ymm22[2],ymm24[2]
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
+; AVX512F-NEXT:    vmovdqa 1088(%rdi), %ymm4
+; AVX512F-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-NEXT:    vmovdqa 1024(%rdi), %ymm3
+; AVX512F-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2]
 ; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
 ; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
 ; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-NEXT:    vmovdqa64 3008(%rdi), %zmm1
 ; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 2944(%rdi), %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 2944(%rdi), %zmm29
+; AVX512F-NEXT:    vmovdqa64 %zmm29, %zmm0
+; AVX512F-NEXT:    vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
-; AVX512F-NEXT:    vmovdqa64 2880(%rdi), %zmm3
-; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 2816(%rdi), %zmm1
-; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm3, %zmm2, %zmm1
+; AVX512F-NEXT:    vmovdqa64 2880(%rdi), %zmm24
+; AVX512F-NEXT:    vmovdqa64 2816(%rdi), %zmm25
+; AVX512F-NEXT:    vmovdqa64 %zmm25, %zmm1
+; AVX512F-NEXT:    vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm24, %zmm2, %zmm1
+; AVX512F-NEXT:    vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512F-NEXT:    vmovdqa 2752(%rdi), %ymm3
-; AVX512F-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-NEXT:    vmovdqa 2688(%rdi), %ymm0
-; AVX512F-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
-; AVX512F-NEXT:    vmovdqa64 2624(%rdi), %ymm31
-; AVX512F-NEXT:    vmovdqa 2560(%rdi), %ymm10
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm10[0],ymm31[0],ymm10[2],ymm31[2]
+; AVX512F-NEXT:    vmovdqa64 2752(%rdi), %ymm27
+; AVX512F-NEXT:    vmovdqa64 2688(%rdi), %ymm26
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm26[0],ymm27[0],ymm26[2],ymm27[2]
+; AVX512F-NEXT:    vmovdqa64 2624(%rdi), %ymm30
+; AVX512F-NEXT:    vmovdqa64 2560(%rdi), %ymm18
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm18[0],ymm30[0],ymm18[2],ymm30[2]
 ; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
 ; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
 ; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-NEXT:    vmovdqa64 2496(%rdi), %zmm1
 ; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 2432(%rdi), %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 2432(%rdi), %zmm31
+; AVX512F-NEXT:    vmovdqa64 %zmm31, %zmm0
+; AVX512F-NEXT:    vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
-; AVX512F-NEXT:    vmovdqa64 2368(%rdi), %zmm18
+; AVX512F-NEXT:    vmovdqa64 2368(%rdi), %zmm3
+; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-NEXT:    vmovdqa64 2304(%rdi), %zmm1
-; AVX512F-NEXT:    vmovdqu64 %zmm1, (%rsp) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm18, %zmm2, %zmm1
-; AVX512F-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm3, %zmm2, %zmm1
 ; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512F-NEXT:    vmovdqa64 2240(%rdi), %ymm28
-; AVX512F-NEXT:    vmovdqa64 2176(%rdi), %ymm19
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm19[0],ymm28[0],ymm19[2],ymm28[2]
-; AVX512F-NEXT:    vmovdqa64 2112(%rdi), %ymm27
-; AVX512F-NEXT:    vmovdqa 2048(%rdi), %ymm6
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm6[0],ymm27[0],ymm6[2],ymm27[2]
-; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
+; AVX512F-NEXT:    vmovdqa 2240(%rdi), %ymm12
+; AVX512F-NEXT:    vmovdqa 2176(%rdi), %ymm11
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm11[0],ymm12[0],ymm11[2],ymm12[2]
+; AVX512F-NEXT:    vmovdqa 2112(%rdi), %ymm10
+; AVX512F-NEXT:    vmovdqa 2048(%rdi), %ymm3
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm5 = ymm3[0],ymm10[0],ymm3[2],ymm10[2]
+; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm5[2,3],ymm0[2,3]
 ; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
 ; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-NEXT:    vmovdqa64 4032(%rdi), %zmm1
@@ -9295,807 +9305,801 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512F-NEXT:    vmovdqa64 3968(%rdi), %zmm0
 ; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
-; AVX512F-NEXT:    vmovdqa64 3904(%rdi), %zmm5
-; AVX512F-NEXT:    vmovdqa64 3840(%rdi), %zmm3
-; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermi2q %zmm5, %zmm3, %zmm2
+; AVX512F-NEXT:    vmovdqa64 3904(%rdi), %zmm1
+; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 3840(%rdi), %zmm4
+; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermi2q %zmm1, %zmm4, %zmm2
 ; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k1}
-; AVX512F-NEXT:    vmovdqa 3776(%rdi), %ymm12
-; AVX512F-NEXT:    vmovdqa 3712(%rdi), %ymm9
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm9[0],ymm12[0],ymm9[2],ymm12[2]
-; AVX512F-NEXT:    vmovdqa 3648(%rdi), %ymm4
-; AVX512F-NEXT:    vmovdqa 3584(%rdi), %ymm1
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm15 = ymm1[0],ymm4[0],ymm1[2],ymm4[2]
-; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 3776(%rdi), %ymm17
+; AVX512F-NEXT:    vmovdqa64 3712(%rdi), %ymm23
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm5 = ymm23[0],ymm17[0],ymm23[2],ymm17[2]
+; AVX512F-NEXT:    vmovdqa 3648(%rdi), %ymm1
+; AVX512F-NEXT:    vmovdqa 3584(%rdi), %ymm0
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm6 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm5 = ymm6[2,3],ymm5[2,3]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm5, %zmm2, %zmm2
+; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,3,11,3,11,3,11]
 ; AVX512F-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm29, %zmm0
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm3, %zmm2, %zmm0
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm13, %zmm2, %zmm15
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm15 {%k1}
-; AVX512F-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512F-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX512F-NEXT:    # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
-; AVX512F-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload
-; AVX512F-NEXT:    # ymm14 = ymm14[1],mem[1],ymm14[3],mem[3]
-; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm14[2,3],ymm0[2,3]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm15, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm7, %zmm2, %zmm0
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm7, %zmm14
-; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm14 # 64-byte Folded Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm14 {%k1}
-; AVX512F-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512F-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX512F-NEXT:    # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm15 = ymm23[1],ymm26[1],ymm23[3],ymm26[3]
-; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm14, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm23, %zmm2, %zmm8
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm29, %zmm14
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm26, %zmm2, %zmm14
-; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm14 {%k1}
-; AVX512F-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm30, %ymm0 # 32-byte Folded Reload
-; AVX512F-NEXT:    # ymm0 = ymm30[1],mem[1],ymm30[3],mem[3]
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm15 = ymm16[1],ymm20[1],ymm16[3],ymm20[3]
-; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm14, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm30, %zmm0
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm20, %zmm2, %zmm0
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm14
+; AVX512F-NEXT:    vpermt2q %zmm8, %zmm2, %zmm28
+; AVX512F-NEXT:    vmovdqa64 %zmm16, %zmm8
+; AVX512F-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm16, %zmm6
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm16, %zmm2, %zmm14
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm14 {%k1}
-; AVX512F-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512F-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX512F-NEXT:    # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm15 = ymm17[1],ymm21[1],ymm17[3],ymm21[3]
-; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm14, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm21, %zmm0
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm17, %zmm2, %zmm0
-; AVX512F-NEXT:    vmovdqa64 %zmm11, %zmm14
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm11, %zmm2, %zmm14
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm14 {%k1}
-; AVX512F-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm25, %ymm0 # 32-byte Folded Reload
-; AVX512F-NEXT:    # ymm0 = ymm25[1],mem[1],ymm25[3],mem[3]
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm15 = ymm22[1],ymm24[1],ymm22[3],ymm24[3]
-; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm14, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm16, %zmm2, %zmm6
+; AVX512F-NEXT:    vmovdqa64 %zmm28, %zmm6 {%k1}
+; AVX512F-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX512F-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload
+; AVX512F-NEXT:    # ymm5 = ymm4[1],mem[1],ymm4[3],mem[3]
+; AVX512F-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
+; AVX512F-NEXT:    # ymm7 = ymm7[1],mem[1],ymm7[3],mem[3]
+; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm5, %zmm6, %zmm5
+; AVX512F-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm28, %zmm5
+; AVX512F-NEXT:    vpermt2q %zmm9, %zmm2, %zmm5
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm6
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm4, %zmm2, %zmm6
+; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm6 {%k1}
+; AVX512F-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm20, %ymm5 # 32-byte Folded Reload
+; AVX512F-NEXT:    # ymm5 = ymm20[1],mem[1],ymm20[3],mem[3]
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm7 = ymm19[1],ymm22[1],ymm19[3],ymm22[3]
+; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm5, %zmm6, %zmm5
+; AVX512F-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm20, %zmm5
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm19, %zmm2, %zmm5
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm22, %zmm0
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm25, %zmm2, %zmm0
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm24, %zmm14
+; AVX512F-NEXT:    vpermt2q %zmm22, %zmm2, %zmm6
+; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm6 {%k1}
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm5 = ymm15[1],ymm14[1],ymm15[3],ymm14[3]
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm7 = ymm13[1],ymm21[1],ymm13[3],ymm21[3]
+; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm5, %zmm6, %zmm5
+; AVX512F-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm14, %zmm5
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm15, %zmm2, %zmm14
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm14 {%k1}
-; AVX512F-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512F-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX512F-NEXT:    # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm10 = ymm10[1],ymm31[1],ymm10[3],ymm31[3]
-; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm10[2,3],ymm0[2,3]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm14, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm31, %zmm0
-; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm0 # 64-byte Folded Reload
-; AVX512F-NEXT:    vmovdqu64 (%rsp), %zmm10 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm18, %zmm2, %zmm10
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm10 {%k1}
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm19[1],ymm28[1],ymm19[3],ymm28[3]
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],ymm27[1],ymm6[3],ymm27[3]
-; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm6[2,3],ymm0[2,3]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm10, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm10, %zmm0
+; AVX512F-NEXT:    vpermt2q %zmm15, %zmm2, %zmm5
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm21, %zmm6
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm13, %zmm2, %zmm6
+; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm6 {%k1}
+; AVX512F-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX512F-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
+; AVX512F-NEXT:    # ymm5 = ymm5[1],mem[1],ymm5[3],mem[3]
+; AVX512F-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX512F-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
+; AVX512F-NEXT:    # ymm7 = ymm7[1],mem[1],ymm7[3],mem[3]
+; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm5, %zmm6, %zmm5
+; AVX512F-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm5 # 64-byte Folded Reload
+; AVX512F-NEXT:    vmovdqu64 (%rsp), %zmm6 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm6 # 64-byte Folded Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm6 {%k1}
+; AVX512F-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX512F-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
+; AVX512F-NEXT:    # ymm5 = ymm5[1],mem[1],ymm5[3],mem[3]
+; AVX512F-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX512F-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
+; AVX512F-NEXT:    # ymm7 = ymm7[1],mem[1],ymm7[3],mem[3]
+; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm5, %zmm6, %zmm5
+; AVX512F-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm29 # 64-byte Folded Reload
+; AVX512F-NEXT:    vpermt2q %zmm24, %zmm2, %zmm25
+; AVX512F-NEXT:    vmovdqa64 %zmm29, %zmm25 {%k1}
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm5 = ymm26[1],ymm27[1],ymm26[3],ymm27[3]
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm7 = ymm18[1],ymm30[1],ymm18[3],ymm30[3]
+; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm5, %zmm25, %zmm5
+; AVX512F-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm24, %zmm2, %zmm31
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm26, %zmm6
+; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm6 # 64-byte Folded Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm31, %zmm6 {%k1}
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm5 = ymm11[1],ymm12[1],ymm11[3],ymm12[3]
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],ymm10[1],ymm3[3],ymm10[3]
+; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm5[2,3]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm3, %zmm6, %zmm3
+; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm29, %zmm3
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm18, %zmm2, %zmm0
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
-; AVX512F-NEXT:    vpermi2q %zmm5, %zmm19, %zmm2
-; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm28
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k1}
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm9[1],ymm12[1],ymm9[3],ymm12[3]
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],ymm4[1],ymm1[3],ymm4[3]
-; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
+; AVX512F-NEXT:    vpermt2q %zmm18, %zmm2, %zmm3
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512F-NEXT:    vpermi2q %zmm11, %zmm12, %zmm2
+; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm2 {%k1}
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm23[1],ymm17[1],ymm23[3],ymm17[3]
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3]
 ; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm0
 ; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 3264(%rdi), %zmm3
+; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 3200(%rdi), %zmm10
 ; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12]
 ; AVX512F-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm10, %zmm1
 ; AVX512F-NEXT:    vpermt2q %zmm3, %zmm0, %zmm1
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm2 = zmm27[0],zmm12[0],zmm27[2],zmm12[2],zmm27[4],zmm12[4],zmm27[6],zmm12[6]
-; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
-; AVX512F-NEXT:    vmovdqa64 3136(%rdi), %zmm1
-; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 3072(%rdi), %zmm3
-; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm0, %zmm3
-; AVX512F-NEXT:    vmovdqa64 3264(%rdi), %zmm5
-; AVX512F-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 3200(%rdi), %zmm4
-; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [4,12,4,12]
-; AVX512F-NEXT:    # ymm1 = mem[0,1,0,1]
-; AVX512F-NEXT:    vpermt2q %zmm5, %zmm1, %zmm4
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm3, %zmm2, %zmm2
+; AVX512F-NEXT:    vmovdqa64 3136(%rdi), %zmm23
+; AVX512F-NEXT:    vmovdqa64 3072(%rdi), %zmm2
 ; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm6
-; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm2
-; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm7[0],zmm13[0],zmm7[2],zmm13[2],zmm7[4],zmm13[4],zmm7[6],zmm13[6]
+; AVX512F-NEXT:    vpermt2q %zmm23, %zmm0, %zmm2
+; AVX512F-NEXT:    vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm17, %zmm2
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm25, %zmm0, %zmm2
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm8[0],zmm16[0],zmm8[2],zmm16[2],zmm8[4],zmm16[4],zmm8[6],zmm16[6]
 ; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
-; AVX512F-NEXT:    vmovdqa64 576(%rdi), %zmm5
-; AVX512F-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 512(%rdi), %zmm2
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm3, %zmm1
+; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 704(%rdi), %zmm2
 ; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm5, %zmm0, %zmm2
-; AVX512F-NEXT:    vmovdqa64 704(%rdi), %zmm5
-; AVX512F-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 640(%rdi), %zmm4
-; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm5, %zmm1, %zmm4
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512F-NEXT:    vmovdqa64 640(%rdi), %zmm1
+; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm2, %zmm0, %zmm1
+; AVX512F-NEXT:    vmovdqa64 576(%rdi), %zmm3
+; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 512(%rdi), %zmm2
 ; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm23, %zmm0, %zmm2
-; AVX512F-NEXT:    vmovdqa64 %zmm23, %zmm7
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm29[0],zmm26[0],zmm29[2],zmm26[2],zmm29[4],zmm26[4],zmm29[6],zmm26[6]
+; AVX512F-NEXT:    vpermt2q %zmm3, %zmm0, %zmm2
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT:    vmovdqa64 %zmm28, %zmm2
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm30, %zmm0, %zmm2
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm9[0],zmm4[0],zmm9[2],zmm4[2],zmm9[4],zmm4[4],zmm9[6],zmm4[6]
 ; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
-; AVX512F-NEXT:    vmovdqa64 64(%rdi), %zmm4
-; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 (%rdi), %zmm2
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm3, %zmm1
+; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 192(%rdi), %zmm2
 ; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm4, %zmm0, %zmm2
-; AVX512F-NEXT:    vmovdqa64 192(%rdi), %zmm23
-; AVX512F-NEXT:    vmovdqa64 128(%rdi), %zmm5
-; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm4
-; AVX512F-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm23, %zmm1, %zmm4
-; AVX512F-NEXT:    vmovdqa64 %zmm23, %zmm26
-; AVX512F-NEXT:    vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512F-NEXT:    vmovdqa64 128(%rdi), %zmm1
+; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm2, %zmm0, %zmm1
+; AVX512F-NEXT:    vmovdqa64 64(%rdi), %zmm3
+; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 (%rdi), %zmm2
 ; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm30, %zmm2
-; AVX512F-NEXT:    vpermt2q %zmm20, %zmm0, %zmm2
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm8[0],zmm16[0],zmm8[2],zmm16[2],zmm8[4],zmm16[4],zmm8[6],zmm16[6]
+; AVX512F-NEXT:    vpermt2q %zmm3, %zmm0, %zmm2
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT:    vmovdqa64 %zmm20, %zmm2
+; AVX512F-NEXT:    vpermt2q %zmm19, %zmm0, %zmm2
+; AVX512F-NEXT:    vmovdqa64 %zmm19, %zmm16
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm3[0],zmm22[0],zmm3[2],zmm22[2],zmm3[4],zmm22[4],zmm3[6],zmm22[6]
 ; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
-; AVX512F-NEXT:    vmovdqa64 1600(%rdi), %zmm30
-; AVX512F-NEXT:    vmovdqa64 1536(%rdi), %zmm2
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm3, %zmm1
+; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 1728(%rdi), %zmm2
 ; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm30, %zmm0, %zmm2
-; AVX512F-NEXT:    vmovdqa64 1728(%rdi), %zmm20
-; AVX512F-NEXT:    vmovdqa64 1664(%rdi), %zmm4
-; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm20, %zmm1, %zmm4
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512F-NEXT:    vmovdqa64 1664(%rdi), %zmm1
+; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm2, %zmm0, %zmm1
+; AVX512F-NEXT:    vmovdqa64 1600(%rdi), %zmm3
+; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 1536(%rdi), %zmm2
 ; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm21, %zmm2
-; AVX512F-NEXT:    vpermt2q %zmm17, %zmm0, %zmm2
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm9[0],zmm11[0],zmm9[2],zmm11[2],zmm9[4],zmm11[4],zmm9[6],zmm11[6]
+; AVX512F-NEXT:    vpermt2q %zmm3, %zmm0, %zmm2
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT:    vmovdqa64 %zmm14, %zmm2
+; AVX512F-NEXT:    vpermt2q %zmm15, %zmm0, %zmm2
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm21[0],zmm13[0],zmm21[2],zmm13[2],zmm21[4],zmm13[4],zmm21[6],zmm13[6]
 ; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
-; AVX512F-NEXT:    vmovdqa64 1088(%rdi), %zmm4
-; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 1024(%rdi), %zmm2
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm3, %zmm1
+; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 1216(%rdi), %zmm2
 ; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm4, %zmm0, %zmm2
-; AVX512F-NEXT:    vmovdqa64 1216(%rdi), %zmm16
-; AVX512F-NEXT:    vmovdqa64 1152(%rdi), %zmm4
-; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm16, %zmm1, %zmm4
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512F-NEXT:    vmovdqa64 1152(%rdi), %zmm1
+; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm2, %zmm0, %zmm1
+; AVX512F-NEXT:    vmovdqa64 1088(%rdi), %zmm3
+; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 1024(%rdi), %zmm2
 ; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm22, %zmm2
-; AVX512F-NEXT:    vpermt2q %zmm25, %zmm0, %zmm2
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm24[0],zmm15[0],zmm24[2],zmm15[2],zmm24[4],zmm15[4],zmm24[6],zmm15[6]
+; AVX512F-NEXT:    vpermt2q %zmm3, %zmm0, %zmm2
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm31, %zmm2
+; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
+; AVX512F-NEXT:    vmovdqu64 (%rsp), %zmm27 # 64-byte Reload
+; AVX512F-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm3 # 64-byte Folded Reload
+; AVX512F-NEXT:    # zmm3 = zmm27[0],mem[0],zmm27[2],mem[2],zmm27[4],mem[4],zmm27[6],mem[6]
 ; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
-; AVX512F-NEXT:    vmovdqa64 2624(%rdi), %zmm4
-; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm3, %zmm1
+; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 2752(%rdi), %zmm3
+; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 2688(%rdi), %zmm8
+; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm1
+; AVX512F-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm3, %zmm0, %zmm1
+; AVX512F-NEXT:    vmovdqa64 2624(%rdi), %zmm3
+; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-NEXT:    vmovdqa64 2560(%rdi), %zmm2
 ; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm4, %zmm0, %zmm2
-; AVX512F-NEXT:    vmovdqa64 2752(%rdi), %zmm14
-; AVX512F-NEXT:    vmovdqa64 2688(%rdi), %zmm4
-; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm14, %zmm1, %zmm4
-; AVX512F-NEXT:    vmovdqa64 %zmm14, %zmm25
-; AVX512F-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512F-NEXT:    vpermt2q %zmm3, %zmm0, %zmm2
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm2
+; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512F-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm3 # 64-byte Folded Reload
+; AVX512F-NEXT:    # zmm3 = zmm7[0],mem[0],zmm7[2],mem[2],zmm7[4],mem[4],zmm7[6],mem[6]
+; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm3, %zmm1
+; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 2240(%rdi), %zmm2
 ; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 2176(%rdi), %zmm1
+; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm2, %zmm0, %zmm1
 ; AVX512F-NEXT:    vmovdqa64 2112(%rdi), %zmm3
 ; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-NEXT:    vmovdqa64 2048(%rdi), %zmm2
 ; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-NEXT:    vpermt2q %zmm3, %zmm0, %zmm2
-; AVX512F-NEXT:    vmovdqa64 2240(%rdi), %zmm4
-; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 2176(%rdi), %zmm3
-; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm4, %zmm1, %zmm3
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm31 # 64-byte Folded Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqu64 (%rsp), %zmm4 # 64-byte Reload
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm4 = zmm4[0],zmm8[0],zmm4[2],zmm8[2],zmm4[4],zmm8[4],zmm4[6],zmm8[6]
-; AVX512F-NEXT:    vmovdqa64 %zmm31, %zmm4 {%k1}
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm2
+; AVX512F-NEXT:    vpermt2q %zmm24, %zmm0, %zmm2
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm26[0],zmm4[0],zmm26[2],zmm4[2],zmm26[4],zmm4[4],zmm26[6],zmm4[6]
+; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm3, %zmm1
+; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 3776(%rdi), %zmm2
 ; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm18, %zmm0, %zmm10
-; AVX512F-NEXT:    vmovdqa64 3648(%rdi), %zmm4
-; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 3584(%rdi), %zmm3
-; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermi2q %zmm4, %zmm3, %zmm0
-; AVX512F-NEXT:    vmovdqa64 3776(%rdi), %zmm4
-; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 3712(%rdi), %zmm3
+; AVX512F-NEXT:    vmovdqa64 3712(%rdi), %zmm1
+; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm2, %zmm0, %zmm1
+; AVX512F-NEXT:    vmovdqa64 3648(%rdi), %zmm3
 ; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermi2q %zmm4, %zmm3, %zmm1
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512F-NEXT:    vmovdqa64 %zmm28, %zmm18
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm1 = zmm19[0],zmm28[0],zmm19[2],zmm28[2],zmm19[4],zmm28[4],zmm19[6],zmm28[6]
-; AVX512F-NEXT:    vmovdqa64 %zmm10, %zmm1 {%k1}
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512F-NEXT:    vmovdqa64 3584(%rdi), %zmm2
+; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm3, %zmm0, %zmm2
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT:    vpermi2q %zmm18, %zmm29, %zmm0
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm2 = zmm12[0],zmm11[0],zmm12[2],zmm11[2],zmm12[4],zmm11[4],zmm12[6],zmm11[6]
+; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k1}
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm2, %zmm0
 ; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [5,13,5,13,5,13,5,13]
 ; AVX512F-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm26, %zmm0, %zmm10
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
-; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [5,13,5,13]
-; AVX512F-NEXT:    # ymm1 = mem[0,1,0,1]
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm3 # 64-byte Folded Reload
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm4 = zmm27[1],zmm12[1],zmm27[3],zmm12[3],zmm27[5],zmm12[5],zmm27[7],zmm12[7]
-; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm23, %zmm0, %zmm2
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm10[4,5,6,7]
+; AVX512F-NEXT:    vmovdqa64 %zmm17, %zmm2
+; AVX512F-NEXT:    vpermt2q %zmm25, %zmm0, %zmm2
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm3 = zmm19[1],zmm25[1],zmm19[3],zmm25[3],zmm19[5],zmm25[5],zmm19[7],zmm25[7]
+; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm3, %zmm1
+; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm22, %zmm0, %zmm1
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
 ; AVX512F-NEXT:    vpermt2q %zmm24, %zmm0, %zmm2
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT:    vmovdqa64 %zmm28, %zmm29
+; AVX512F-NEXT:    vmovdqa64 %zmm28, %zmm2
+; AVX512F-NEXT:    vpermt2q %zmm30, %zmm0, %zmm2
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm28, %zmm1, %zmm3
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm3
-; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm23
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm4 = zmm31[1],zmm13[1],zmm31[3],zmm13[3],zmm31[5],zmm13[5],zmm31[7],zmm13[7]
-; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm3 = zmm15[1],zmm28[1],zmm15[3],zmm28[3],zmm15[5],zmm28[5],zmm15[7],zmm28[7]
+; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm3, %zmm1
+; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm20, %zmm0, %zmm1
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm22, %zmm0, %zmm2
-; AVX512F-NEXT:    vpermt2q %zmm26, %zmm1, %zmm5
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7]
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm7, %zmm0, %zmm3
-; AVX512F-NEXT:    vmovdqa64 %zmm29, %zmm14
-; AVX512F-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm4 # 64-byte Folded Reload
-; AVX512F-NEXT:    # zmm4 = zmm29[1],mem[1],zmm29[3],mem[3],zmm29[5],mem[5],zmm29[7],mem[7]
-; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm21, %zmm0, %zmm2
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm30, %zmm0, %zmm2
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm20, %zmm1, %zmm3
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
+; AVX512F-NEXT:    vpermt2q %zmm16, %zmm0, %zmm2
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm3 = zmm12[1],zmm18[1],zmm12[3],zmm18[3],zmm12[5],zmm18[5],zmm12[7],zmm18[7]
+; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm3, %zmm1
+; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
-; AVX512F-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm4 # 64-byte Folded Reload
-; AVX512F-NEXT:    # zmm4 = zmm17[1],mem[1],zmm17[3],mem[3],zmm17[5],mem[5],zmm17[7],mem[7]
-; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm17, %zmm0, %zmm1
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm19, %zmm0, %zmm2
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm15, %zmm3
-; AVX512F-NEXT:    vmovdqa64 %zmm16, %zmm29
-; AVX512F-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm16, %zmm1, %zmm3
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm16, %zmm0, %zmm2
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT:    vmovdqa64 %zmm14, %zmm2
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm23, %zmm0, %zmm2
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm11, %zmm16
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm4 = zmm9[1],zmm11[1],zmm9[3],zmm11[3],zmm9[5],zmm11[5],zmm9[7],zmm11[7]
-; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm11, %zmm2
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm12, %zmm0, %zmm2
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm3 = zmm3[1],zmm13[1],zmm3[3],zmm13[3],zmm3[5],zmm13[5],zmm3[7],zmm13[7]
+; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm3, %zmm1
+; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm3
-; AVX512F-NEXT:    vpermt2q %zmm25, %zmm1, %zmm3
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload
-; AVX512F-NEXT:    # zmm4 = zmm4[1],mem[1],zmm4[3],mem[3],zmm4[5],mem[5],zmm4[7],mem[7]
-; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm13, %zmm0, %zmm1
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm7, %zmm0, %zmm2
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm3
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm10, %zmm1, %zmm3
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm26, %zmm3
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm21, %zmm0, %zmm3
-; AVX512F-NEXT:    vmovdqu64 (%rsp), %zmm4 # 64-byte Reload
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm4 = zmm4[1],zmm8[1],zmm4[3],zmm8[3],zmm4[5],zmm8[5],zmm4[7],zmm8[7]
-; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm14, %zmm0, %zmm2
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm31 # 64-byte Folded Reload
+; AVX512F-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm3 # 64-byte Folded Reload
+; AVX512F-NEXT:    # zmm3 = zmm27[1],mem[1],zmm27[3],mem[3],zmm27[5],mem[5],zmm27[7],mem[7]
+; AVX512F-NEXT:    vmovdqa64 %zmm31, %zmm3 {%k1}
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm3, %zmm1
+; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm31, %zmm0, %zmm8
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm11, %zmm0, %zmm2
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm8[4,5,6,7]
+; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm9 # 64-byte Folded Reload
+; AVX512F-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm3 # 64-byte Folded Reload
+; AVX512F-NEXT:    # zmm3 = zmm7[1],mem[1],zmm7[3],mem[3],zmm7[5],mem[5],zmm7[7],mem[7]
+; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm3 {%k1}
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm3, %zmm1
+; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm27, %zmm0, %zmm1
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm7, %zmm2
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm8, %zmm0, %zmm2
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT:    vpermi2q %zmm6, %zmm3, %zmm0
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm3 = zmm3[1],zmm4[1],zmm3[3],zmm4[3],zmm3[5],zmm4[5],zmm3[7],zmm4[7]
+; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm3 {%k1}
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm3, %zmm1
+; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm1
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm5, %zmm0, %zmm1
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-NEXT:    vpermi2q %zmm4, %zmm5, %zmm1
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm1 = zmm25[1],zmm18[1],zmm25[3],zmm18[3],zmm25[5],zmm18[5],zmm25[7],zmm18[7]
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm1 {%k1}
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm2
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm6, %zmm0, %zmm2
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512F-NEXT:    vpermi2q %zmm10, %zmm9, %zmm0
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 64-byte Folded Reload
+; AVX512F-NEXT:    # zmm2 = zmm2[1],mem[1],zmm2[3],mem[3],zmm2[5],mem[5],zmm2[7],mem[7]
+; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k1}
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm2, %zmm0
 ; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [6,14,6,14,6,14,6,14]
 ; AVX512F-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm26, %zmm0, %zmm1
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
 ; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
-; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14]
-; AVX512F-NEXT:    # ymm1 = mem[0,1,0,1]
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm3 # 64-byte Folded Reload
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
-; AVX512F-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm3 {%k1} # 64-byte Folded Reload
-; AVX512F-NEXT:    # zmm3 {%k1} = zmm27[0],mem[0],zmm27[2],mem[2],zmm27[4],mem[4],zmm27[6],mem[6]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT:    vmovdqa64 %zmm19, %zmm2
+; AVX512F-NEXT:    vpermt2q %zmm25, %zmm0, %zmm2
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
+; AVX512F-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm2 {%k1} # 64-byte Folded Reload
+; AVX512F-NEXT:    # zmm2 {%k1} = zmm25[0],mem[0],zmm25[2],mem[2],zmm25[4],mem[4],zmm25[6],mem[6]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm2, %zmm1
+; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm22, %zmm0, %zmm1
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm24, %zmm0, %zmm2
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT:    vpermt2q %zmm28, %zmm0, %zmm15
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm15 {%k1} = zmm29[0],zmm30[0],zmm29[2],zmm30[2],zmm29[4],zmm30[4],zmm29[6],zmm30[6]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm15, %zmm1
+; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm20, %zmm0, %zmm1
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm21, %zmm0, %zmm2
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT:    vmovdqa64 %zmm12, %zmm2
+; AVX512F-NEXT:    vpermt2q %zmm18, %zmm0, %zmm2
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm30[0],zmm26[0],zmm30[2],zmm26[2],zmm30[4],zmm26[4],zmm30[6],zmm26[6]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm2, %zmm1
+; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm17, %zmm0, %zmm1
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm24, %zmm0, %zmm2
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm28, %zmm1, %zmm3
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512F-NEXT:    vpermt2q %zmm23, %zmm0, %zmm31
+; AVX512F-NEXT:    vpermt2q %zmm16, %zmm0, %zmm2
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm16, %zmm2
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm29, %zmm0, %zmm2
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
-; AVX512F-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm24, %zmm31 {%k1} # 64-byte Folded Reload
-; AVX512F-NEXT:    # zmm31 {%k1} = zmm24[0],mem[0],zmm24[2],mem[2],zmm24[4],mem[4],zmm24[6],mem[6]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm31, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm24[0],zmm23[0],zmm24[2],zmm23[2],zmm24[4],zmm23[4],zmm24[6],zmm23[6]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm2, %zmm1
+; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm13, %zmm0, %zmm1
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm22, %zmm0, %zmm2
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm3 # 64-byte Folded Reload
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512F-NEXT:    vmovdqa64 %zmm14, %zmm3
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm22, %zmm0, %zmm3
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm14[0],zmm28[0],zmm14[2],zmm28[2],zmm14[4],zmm28[4],zmm14[6],zmm28[6]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm14, %zmm0, %zmm2
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT:    vmovdqu64 (%rsp), %zmm2 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm21, %zmm0, %zmm2
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm23[0],zmm18[0],zmm23[2],zmm18[2],zmm23[4],zmm18[4],zmm23[6],zmm18[6]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm2, %zmm1
+; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm31, %zmm0, %zmm1
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm30, %zmm0, %zmm2
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm20, %zmm1, %zmm3
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512F-NEXT:    vmovdqa64 %zmm17, %zmm3
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm17, %zmm0, %zmm3
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm30[0],zmm27[0],zmm30[2],zmm27[2],zmm30[4],zmm27[4],zmm30[6],zmm27[6]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm11, %zmm0, %zmm2
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm19, %zmm0, %zmm2
-; AVX512F-NEXT:    vpermt2q %zmm29, %zmm1, %zmm15
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm15[4,5,6,7]
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm20, %zmm3
-; AVX512F-NEXT:    vpermt2q %zmm16, %zmm0, %zmm3
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm31[0],zmm16[0],zmm31[2],zmm16[2],zmm31[4],zmm16[4],zmm31[6],zmm16[6]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm12, %zmm0, %zmm11
-; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm13 # 64-byte Folded Reload
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm13[4,5,6,7]
+; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm11, %zmm3
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm13, %zmm0, %zmm3
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512F-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm3 {%k1} # 64-byte Folded Reload
-; AVX512F-NEXT:    # zmm3 {%k1} = zmm12[0],mem[0],zmm12[2],mem[2],zmm12[4],mem[4],zmm12[6],mem[6]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm7, %zmm0, %zmm2
-; AVX512F-NEXT:    vpermt2q %zmm10, %zmm1, %zmm9
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7]
-; AVX512F-NEXT:    vmovdqu64 (%rsp), %zmm10 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm10, %zmm3
-; AVX512F-NEXT:    vpermt2q %zmm8, %zmm0, %zmm3
-; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm7
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm26[0],zmm21[0],zmm26[2],zmm21[2],zmm26[4],zmm21[4],zmm26[6],zmm21[6]
-; AVX512F-NEXT:    vmovdqa64 %zmm21, %zmm8
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm25, %zmm2
-; AVX512F-NEXT:    vpermt2q %zmm18, %zmm0, %zmm2
+; AVX512F-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm2 {%k1} # 64-byte Folded Reload
+; AVX512F-NEXT:    # zmm2 {%k1} = zmm11[0],mem[0],zmm11[2],mem[2],zmm11[4],mem[4],zmm11[6],mem[6]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm2, %zmm1
+; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm27, %zmm0, %zmm1
+; AVX512F-NEXT:    vpermt2q %zmm8, %zmm0, %zmm7
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm22, %zmm2
+; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm7[0],zmm28[0],zmm7[2],zmm28[2],zmm7[4],zmm28[4],zmm7[6],zmm28[6]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm2, %zmm1
+; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm5, %zmm0, %zmm3
+; AVX512F-NEXT:    vpermt2q %zmm6, %zmm0, %zmm4
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512F-NEXT:    vpermi2q %zmm25, %zmm15, %zmm0
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm9[0],zmm10[0],zmm9[2],zmm10[2],zmm9[4],zmm10[4],zmm9[6],zmm10[6]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm0
+; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm19, %zmm1
+; AVX512F-NEXT:    vmovdqa64 %zmm19, %zmm0
+; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm5 = [7,15,7,15,7,15,7,15]
+; AVX512F-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT:    vpermi2q %zmm6, %zmm3, %zmm0
-; AVX512F-NEXT:    vpermi2q %zmm4, %zmm5, %zmm1
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm9[0],zmm5[0],zmm9[2],zmm5[2],zmm9[4],zmm5[4],zmm9[6],zmm5[6]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm0
+; AVX512F-NEXT:    vpermt2q %zmm3, %zmm5, %zmm19
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm19 {%k1} = zmm17[1],zmm2[1],zmm17[3],zmm2[3],zmm17[5],zmm2[5],zmm17[7],zmm2[7]
+; AVX512F-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm17, %zmm6
+; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm12 = [0,8,0,8,0,8,0,8]
+; AVX512F-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vpermt2q %zmm2, %zmm12, %zmm17
+; AVX512F-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm4
+; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [1,9,1,9,1,9,1,9]
+; AVX512F-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vpermt2q %zmm4, %zmm2, %zmm6
+; AVX512F-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm3, %zmm12, %zmm1
+; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm3, %zmm2, %zmm0
 ; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm6
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm20
 ; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm0
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm4 = [7,15,7,15,7,15,7,15]
-; AVX512F-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm23, %zmm2
-; AVX512F-NEXT:    vpermt2q %zmm23, %zmm4, %zmm1
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm24[1],zmm3[1],zmm24[3],zmm3[3],zmm24[5],zmm3[5],zmm24[7],zmm3[7]
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm4, %zmm5, %zmm1
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm13[1],zmm11[1],zmm13[3],zmm11[3],zmm13[5],zmm11[5],zmm13[7],zmm11[7]
 ; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm24, %zmm1
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm14 = [0,8,0,8,0,8,0,8]
-; AVX512F-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vpermt2q %zmm3, %zmm14, %zmm24
-; AVX512F-NEXT:    vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm23 = [1,9,1,9,1,9,1,9]
-; AVX512F-NEXT:    # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vpermt2q %zmm3, %zmm23, %zmm1
+; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm1
+; AVX512F-NEXT:    vpermt2q %zmm11, %zmm12, %zmm13
+; AVX512F-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm11, %zmm2, %zmm1
 ; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm2, %zmm14, %zmm6
-; AVX512F-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm2, %zmm23, %zmm0
+; AVX512F-NEXT:    vpermt2q %zmm4, %zmm12, %zmm20
+; AVX512F-NEXT:    vpermt2q %zmm4, %zmm2, %zmm0
 ; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm24
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm1
-; AVX512F-NEXT:    vpermt2q %zmm22, %zmm4, %zmm2
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm15[1],zmm28[1],zmm15[3],zmm28[3],zmm15[5],zmm28[5],zmm15[7],zmm28[7]
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm15, %zmm6
-; AVX512F-NEXT:    vpermt2q %zmm28, %zmm14, %zmm15
-; AVX512F-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm28, %zmm23, %zmm6
-; AVX512F-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm22, %zmm14, %zmm24
-; AVX512F-NEXT:    vpermt2q %zmm22, %zmm23, %zmm1
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm17
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm0
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm9, %zmm5, %zmm1
+; AVX512F-NEXT:    vmovdqa64 %zmm26, %zmm10
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm30[1],zmm26[1],zmm30[3],zmm26[3],zmm30[5],zmm26[5],zmm30[7],zmm26[7]
 ; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm19, %zmm29
-; AVX512F-NEXT:    vmovdqa64 %zmm19, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm17, %zmm4, %zmm19
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm19 {%k1} = zmm30[1],zmm27[1],zmm30[3],zmm27[3],zmm30[5],zmm27[5],zmm30[7],zmm27[7]
 ; AVX512F-NEXT:    vmovdqa64 %zmm30, %zmm1
-; AVX512F-NEXT:    vpermt2q %zmm27, %zmm14, %zmm30
-; AVX512F-NEXT:    vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm27, %zmm23, %zmm1
+; AVX512F-NEXT:    vmovdqa64 %zmm30, %zmm26
+; AVX512F-NEXT:    vpermt2q %zmm10, %zmm12, %zmm26
+; AVX512F-NEXT:    vpermt2q %zmm10, %zmm2, %zmm1
 ; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm17, %zmm14, %zmm29
-; AVX512F-NEXT:    vpermt2q %zmm17, %zmm23, %zmm0
+; AVX512F-NEXT:    vpermt2q %zmm9, %zmm12, %zmm17
+; AVX512F-NEXT:    vpermt2q %zmm9, %zmm2, %zmm0
 ; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm20, %zmm28
-; AVX512F-NEXT:    vmovdqa64 %zmm20, %zmm3
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm4, %zmm20
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm20 {%k1} = zmm31[1],zmm16[1],zmm31[3],zmm16[3],zmm31[5],zmm16[5],zmm31[7],zmm16[7]
-; AVX512F-NEXT:    vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm31, %zmm2
-; AVX512F-NEXT:    vmovdqa64 %zmm31, %zmm17
-; AVX512F-NEXT:    vpermt2q %zmm16, %zmm14, %zmm17
-; AVX512F-NEXT:    vpermt2q %zmm16, %zmm23, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm14, %zmm28
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm23, %zmm3
-; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm11, %zmm21
-; AVX512F-NEXT:    vmovdqa64 %zmm11, %zmm31
-; AVX512F-NEXT:    vpermt2q %zmm13, %zmm4, %zmm11
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm11 {%k1} = zmm3[1],zmm0[1],zmm3[3],zmm0[3],zmm3[5],zmm0[5],zmm3[7],zmm0[7]
-; AVX512F-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm2
-; AVX512F-NEXT:    vpermt2q %zmm0, %zmm14, %zmm3
-; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm0, %zmm23, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm13, %zmm14, %zmm21
-; AVX512F-NEXT:    vpermt2q %zmm13, %zmm23, %zmm31
-; AVX512F-NEXT:    vmovdqa64 %zmm10, %zmm2
-; AVX512F-NEXT:    vmovdqa64 %zmm10, %zmm20
-; AVX512F-NEXT:    vmovdqa64 %zmm10, %zmm25
-; AVX512F-NEXT:    vpermt2q %zmm7, %zmm4, %zmm2
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm26[1],zmm8[1],zmm26[3],zmm8[3],zmm26[5],zmm8[5],zmm26[7],zmm8[7]
-; AVX512F-NEXT:    vmovdqu64 %zmm2, (%rsp) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm26, %zmm2
-; AVX512F-NEXT:    vpermt2q %zmm8, %zmm14, %zmm26
-; AVX512F-NEXT:    vpermt2q %zmm8, %zmm23, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm7, %zmm14, %zmm20
-; AVX512F-NEXT:    vpermt2q %zmm7, %zmm23, %zmm25
+; AVX512F-NEXT:    vmovdqa64 %zmm16, %zmm19
+; AVX512F-NEXT:    vmovdqa64 %zmm16, %zmm31
+; AVX512F-NEXT:    vmovdqa64 %zmm29, %zmm3
+; AVX512F-NEXT:    vpermt2q %zmm29, %zmm5, %zmm16
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm0
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm3, %zmm14, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm30, %zmm16
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm27, %zmm14, %zmm16
-; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm5, %zmm14, %zmm0
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm16 {%k1} = zmm24[1],zmm6[1],zmm24[3],zmm6[3],zmm24[5],zmm6[5],zmm24[7],zmm6[7]
+; AVX512F-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm24, %zmm0
+; AVX512F-NEXT:    vmovdqa64 %zmm24, %zmm29
+; AVX512F-NEXT:    vpermt2q %zmm6, %zmm12, %zmm29
+; AVX512F-NEXT:    vpermt2q %zmm6, %zmm2, %zmm0
 ; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vpermi2q %zmm18, %zmm0, %zmm14
-; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm2
-; AVX512F-NEXT:    vpermt2q %zmm3, %zmm23, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm30, %zmm22
-; AVX512F-NEXT:    vpermt2q %zmm27, %zmm23, %zmm22
-; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm2
-; AVX512F-NEXT:    vpermt2q %zmm5, %zmm23, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermi2q %zmm18, %zmm0, %zmm23
-; AVX512F-NEXT:    vpermt2q %zmm18, %zmm4, %zmm0
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm9[1],zmm5[1],zmm9[3],zmm5[3],zmm9[5],zmm5[5],zmm9[7],zmm5[7]
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm18
-; AVX512F-NEXT:    vpermt2q %zmm27, %zmm4, %zmm30
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm30 {%k1} = zmm6[1],zmm3[1],zmm6[3],zmm3[3],zmm6[5],zmm3[5],zmm6[7],zmm3[7]
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm7 # 64-byte Folded Reload
+; AVX512F-NEXT:    vpermt2q %zmm3, %zmm12, %zmm19
+; AVX512F-NEXT:    vpermt2q %zmm3, %zmm2, %zmm31
+; AVX512F-NEXT:    vmovdqu64 (%rsp), %zmm1 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm16
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm27
+; AVX512F-NEXT:    vpermt2q %zmm21, %zmm5, %zmm1
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm23[1],zmm18[1],zmm23[3],zmm18[3],zmm23[5],zmm18[5],zmm23[7],zmm18[7]
+; AVX512F-NEXT:    vmovdqu64 %zmm1, (%rsp) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm23, %zmm1
+; AVX512F-NEXT:    vmovdqa64 %zmm23, %zmm24
+; AVX512F-NEXT:    vpermt2q %zmm18, %zmm12, %zmm24
+; AVX512F-NEXT:    vpermt2q %zmm18, %zmm2, %zmm1
+; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm21, %zmm12, %zmm16
+; AVX512F-NEXT:    vpermt2q %zmm21, %zmm2, %zmm27
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm18
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm30
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm8 # 64-byte Folded Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm10 # 64-byte Folded Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm11 # 64-byte Folded Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm12 # 64-byte Folded Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm13 # 64-byte Folded Reload
+; AVX512F-NEXT:    vpermt2q %zmm8, %zmm5, %zmm1
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm0 # 64-byte Folded Reload
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm10[1],zmm0[1],zmm10[3],zmm0[3],zmm10[5],zmm0[5],zmm10[7],zmm0[7]
+; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm10, %zmm1
+; AVX512F-NEXT:    vpermt2q %zmm0, %zmm12, %zmm10
+; AVX512F-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm0, %zmm2, %zmm1
+; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm8, %zmm12, %zmm18
+; AVX512F-NEXT:    vpermt2q %zmm8, %zmm2, %zmm30
+; AVX512F-NEXT:    vmovdqa64 %zmm22, %zmm6
+; AVX512F-NEXT:    vmovdqa64 %zmm22, %zmm23
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm8, %zmm5, %zmm6
+; AVX512F-NEXT:    vmovdqa64 %zmm28, %zmm0
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm6 {%k1} = zmm21[1],zmm28[1],zmm21[3],zmm28[3],zmm21[5],zmm28[5],zmm21[7],zmm28[7]
+; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm28
+; AVX512F-NEXT:    vmovdqa64 %zmm21, %zmm1
+; AVX512F-NEXT:    vpermt2q %zmm0, %zmm12, %zmm21
+; AVX512F-NEXT:    vpermt2q %zmm0, %zmm2, %zmm1
+; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm8, %zmm12, %zmm23
+; AVX512F-NEXT:    vpermt2q %zmm8, %zmm2, %zmm22
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm14, %zmm0
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm8, %zmm12, %zmm0
 ; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm15 # 64-byte Folded Reload
-; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [7,15,7,15]
-; AVX512F-NEXT:    # ymm0 = mem[0,1,0,1]
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm9 # 64-byte Folded Reload
+; AVX512F-NEXT:    vpermi2q %zmm25, %zmm15, %zmm12
+; AVX512F-NEXT:    vmovdqa64 %zmm14, %zmm3
+; AVX512F-NEXT:    vpermt2q %zmm8, %zmm2, %zmm3
+; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermi2q %zmm25, %zmm15, %zmm2
+; AVX512F-NEXT:    vpermt2q %zmm25, %zmm5, %zmm15
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm14[1],zmm8[1],zmm14[3],zmm8[3],zmm14[5],zmm8[5],zmm14[7],zmm8[7]
+; AVX512F-NEXT:    vmovdqa64 %zmm15, %zmm25
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm7 # 64-byte Folded Reload
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm0 # 64-byte Folded Reload
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm8 # 64-byte Folded Reload
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
+; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm1 # 64-byte Folded Reload
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm13 # 64-byte Folded Reload
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
+; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm3 # 64-byte Folded Reload
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm9 # 64-byte Folded Reload
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm6 # 64-byte Folded Reload
-; AVX512F-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm4 # 64-byte Folded Reload
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm11 # 64-byte Folded Reload
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm6 # 64-byte Folded Reload
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm9[4,5,6,7]
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm7, %zmm0
+; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm6 # 64-byte Folded Reload
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm15 # 64-byte Folded Reload
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm10 # 64-byte Folded Reload
+; AVX512F-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm10 # 64-byte Folded Reload
+; AVX512F-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm10 # 64-byte Folded Reload
+; AVX512F-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm10 # 64-byte Folded Reload
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm14 # 64-byte Folded Reload
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7]
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm5, %zmm0
 ; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm8[4,5,6,7]
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
 ; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
 ; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm2[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm19, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm3[4,5,6,7]
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm11
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm4[4,5,6,7]
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm13[4,5,6,7]
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
 ; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
 ; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm5[4,5,6,7]
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm9[4,5,6,7]
 ; AVX512F-NEXT:    vmovdqu64 (%rsp), %zmm1 # 64-byte Reload
 ; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
 ; AVX512F-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm11[4,5,6,7]
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload
+; AVX512F-NEXT:    # ymm0 = mem[0,1,2,3],ymm15[4,5,6,7]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm28, %zmm0
+; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512F-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
 ; AVX512F-NEXT:    # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm18, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm6[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm30, %zmm0
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm25, %zmm25
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm10[4,5,6,7]
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
 ; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm24 {%k1}
-; AVX512F-NEXT:    vmovdqa 64(%rdi), %xmm0
-; AVX512F-NEXT:    vinserti128 $1, 192(%rdi), %ymm0, %ymm0
-; AVX512F-NEXT:    vmovdqa (%rdi), %xmm15
-; AVX512F-NEXT:    vinserti128 $1, 128(%rdi), %ymm15, %ymm15
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm30 = ymm15[0],ymm0[0],ymm15[2],ymm0[2]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm30, %zmm24, %zmm1
-; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm26, %zmm17 {%k1}
+; AVX512F-NEXT:    vmovdqa 64(%rdi), %xmm8
+; AVX512F-NEXT:    vinserti128 $1, 192(%rdi), %ymm8, %ymm8
+; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512F-NEXT:    vinserti128 $1, 128(%rdi), %ymm0, %ymm0
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm28 = ymm0[0],ymm8[0],ymm0[2],ymm8[2]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm28, %zmm17, %zmm26
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm1 {%k1}
-; AVX512F-NEXT:    vmovdqa64 576(%rdi), %xmm24
-; AVX512F-NEXT:    vinserti32x4 $1, 704(%rdi), %ymm24, %ymm30
-; AVX512F-NEXT:    vmovdqa64 512(%rdi), %xmm24
-; AVX512F-NEXT:    vinserti32x4 $1, 640(%rdi), %ymm24, %ymm24
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm27 = ymm24[0],ymm30[0],ymm24[2],ymm30[2]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm27, %zmm1, %zmm1
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm20 {%k1}
+; AVX512F-NEXT:    vmovdqa 576(%rdi), %xmm13
+; AVX512F-NEXT:    vinserti128 $1, 704(%rdi), %ymm13, %ymm13
+; AVX512F-NEXT:    vmovdqa64 512(%rdi), %xmm28
+; AVX512F-NEXT:    vinserti32x4 $1, 640(%rdi), %ymm28, %ymm28
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm4 = ymm28[0],ymm13[0],ymm28[2],ymm13[2]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm4, %zmm20, %zmm1
 ; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm17, %zmm28 {%k1}
-; AVX512F-NEXT:    vmovdqa 1088(%rdi), %xmm9
-; AVX512F-NEXT:    vinserti32x4 $1, 1216(%rdi), %ymm9, %ymm27
-; AVX512F-NEXT:    vmovdqa 1024(%rdi), %xmm9
-; AVX512F-NEXT:    vinserti128 $1, 1152(%rdi), %ymm9, %ymm9
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm4 = ymm9[0],ymm27[0],ymm9[2],ymm27[2]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm4, %zmm28, %zmm1
-; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm29 {%k1}
-; AVX512F-NEXT:    vmovdqa 1600(%rdi), %xmm4
-; AVX512F-NEXT:    vinserti128 $1, 1728(%rdi), %ymm4, %ymm4
-; AVX512F-NEXT:    vmovdqa64 1536(%rdi), %xmm17
-; AVX512F-NEXT:    vinserti32x4 $1, 1664(%rdi), %ymm17, %ymm17
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm19 = ymm17[0],ymm4[0],ymm17[2],ymm4[2]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm19, %zmm29, %zmm13
-; AVX512F-NEXT:    vmovdqa64 %zmm26, %zmm20 {%k1}
-; AVX512F-NEXT:    vmovdqa64 2112(%rdi), %xmm19
-; AVX512F-NEXT:    vinserti32x4 $1, 2240(%rdi), %ymm19, %ymm19
-; AVX512F-NEXT:    vmovdqa64 2048(%rdi), %xmm26
-; AVX512F-NEXT:    vinserti32x4 $1, 2176(%rdi), %ymm26, %ymm26
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm29 = ymm26[0],ymm19[0],ymm26[2],ymm19[2]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm29, %zmm20, %zmm5
+; AVX512F-NEXT:    vmovdqa64 %zmm24, %zmm16 {%k1}
+; AVX512F-NEXT:    vmovdqa 1088(%rdi), %xmm4
+; AVX512F-NEXT:    vinserti128 $1, 1216(%rdi), %ymm4, %ymm4
+; AVX512F-NEXT:    vmovdqa 1024(%rdi), %xmm7
+; AVX512F-NEXT:    vinserti128 $1, 1152(%rdi), %ymm7, %ymm7
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm5 = ymm7[0],ymm4[0],ymm7[2],ymm4[2]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm5, %zmm16, %zmm24
+; AVX512F-NEXT:    vmovdqa64 %zmm29, %zmm19 {%k1}
+; AVX512F-NEXT:    vmovdqa 1600(%rdi), %xmm5
+; AVX512F-NEXT:    vinserti128 $1, 1728(%rdi), %ymm5, %ymm5
+; AVX512F-NEXT:    vmovdqa 1536(%rdi), %xmm11
+; AVX512F-NEXT:    vinserti128 $1, 1664(%rdi), %ymm11, %ymm11
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm29 = ymm11[0],ymm5[0],ymm11[2],ymm5[2]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm29, %zmm19, %zmm19
+; AVX512F-NEXT:    vmovdqa64 %zmm21, %zmm23 {%k1}
+; AVX512F-NEXT:    vmovdqa 2112(%rdi), %xmm6
+; AVX512F-NEXT:    vinserti128 $1, 2240(%rdi), %ymm6, %ymm6
+; AVX512F-NEXT:    vmovdqa64 2048(%rdi), %xmm29
+; AVX512F-NEXT:    vinserti32x4 $1, 2176(%rdi), %ymm29, %ymm29
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm29[0],ymm6[0],ymm29[2],ymm6[2]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm3, %zmm23, %zmm3
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm21 {%k1}
-; AVX512F-NEXT:    vmovdqa 2624(%rdi), %xmm12
-; AVX512F-NEXT:    vinserti128 $1, 2752(%rdi), %ymm12, %ymm12
-; AVX512F-NEXT:    vmovdqa64 2560(%rdi), %xmm29
-; AVX512F-NEXT:    vinserti32x4 $1, 2688(%rdi), %ymm29, %ymm29
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm18 = ymm29[0],ymm12[0],ymm29[2],ymm12[2]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm18, %zmm21, %zmm18
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm18 {%k1}
+; AVX512F-NEXT:    vmovdqa 2624(%rdi), %xmm10
+; AVX512F-NEXT:    vinserti128 $1, 2752(%rdi), %ymm10, %ymm10
+; AVX512F-NEXT:    vmovdqa64 2560(%rdi), %xmm23
+; AVX512F-NEXT:    vinserti32x4 $1, 2688(%rdi), %ymm23, %ymm23
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm20 = ymm23[0],ymm10[0],ymm23[2],ymm10[2]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm20, %zmm18, %zmm18
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm16 {%k1}
-; AVX512F-NEXT:    vmovdqa 3136(%rdi), %xmm6
-; AVX512F-NEXT:    vinserti128 $1, 3264(%rdi), %ymm6, %ymm6
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm1 {%k1}
+; AVX512F-NEXT:    vmovdqa64 3136(%rdi), %xmm17
+; AVX512F-NEXT:    vinserti32x4 $1, 3264(%rdi), %ymm17, %ymm17
 ; AVX512F-NEXT:    vmovdqa64 3072(%rdi), %xmm20
 ; AVX512F-NEXT:    vinserti32x4 $1, 3200(%rdi), %ymm20, %ymm20
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm21 = ymm20[0],ymm6[0],ymm20[2],ymm6[2]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm21, %zmm16, %zmm2
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm15 = ymm20[0],ymm17[0],ymm20[2],ymm17[2]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm15, %zmm1, %zmm15
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm14 {%k1}
-; AVX512F-NEXT:    vmovdqa 3648(%rdi), %xmm7
-; AVX512F-NEXT:    vinserti128 $1, 3776(%rdi), %ymm7, %ymm7
-; AVX512F-NEXT:    vmovdqa64 3584(%rdi), %xmm21
-; AVX512F-NEXT:    vinserti32x4 $1, 3712(%rdi), %ymm21, %ymm21
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm28 = ymm21[0],ymm7[0],ymm21[2],ymm7[2]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm28, %zmm14, %zmm1
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm22 {%k1}
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm6 = ymm20[1],ymm6[1],ymm20[3],ymm6[3]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm6, %zmm22, %zmm3
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm12 {%k1}
+; AVX512F-NEXT:    vmovdqa 3648(%rdi), %xmm9
+; AVX512F-NEXT:    vinserti128 $1, 3776(%rdi), %ymm9, %ymm9
+; AVX512F-NEXT:    vmovdqa 3584(%rdi), %xmm14
+; AVX512F-NEXT:    vinserti128 $1, 3712(%rdi), %ymm14, %ymm14
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm16 = ymm14[0],ymm9[0],ymm14[2],ymm9[2]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm16, %zmm12, %zmm1
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm16, %zmm12 {%k1}
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm16 = ymm20[1],ymm17[1],ymm20[3],ymm17[3]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm16, %zmm12, %zmm16
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm17, %zmm12 {%k1}
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm13 = ymm28[1],ymm13[1],ymm28[3],ymm13[3]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm13, %zmm12, %zmm13
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm17, %zmm12 {%k1}
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm8[1],ymm0[3],ymm8[3]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm12, %zmm0
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm8 {%k1}
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm6 = ymm24[1],ymm30[1],ymm24[3],ymm30[3]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm6, %zmm8, %zmm6
+; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm31 {%k1}
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm5 = ymm11[1],ymm5[1],ymm11[3],ymm5[3]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm5, %zmm31, %zmm5
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm10, %zmm8 {%k1}
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm15[1],ymm0[1],ymm15[3],ymm0[3]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm8, %zmm0
+; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm27 {%k1}
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm4 = ymm7[1],ymm4[1],ymm7[3],ymm4[3]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm4, %zmm27, %zmm4
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm7, %zmm30 {%k1}
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm7 = ymm23[1],ymm10[1],ymm23[3],ymm10[3]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm7, %zmm30, %zmm7
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm10, %zmm8 {%k1}
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm4 = ymm17[1],ymm4[1],ymm17[3],ymm4[3]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm4, %zmm8, %zmm4
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm22 {%k1}
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm6 = ymm29[1],ymm6[1],ymm29[3],ymm6[3]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm6, %zmm22, %zmm6
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm10 {%k1}
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm8 = ymm9[1],ymm27[1],ymm9[3],ymm27[3]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm8, %zmm10, %zmm8
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm31 {%k1}
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm9 = ymm29[1],ymm12[1],ymm29[3],ymm12[3]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm9, %zmm31, %zmm9
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm10, %zmm25 {%k1}
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm10 = ymm26[1],ymm19[1],ymm26[3],ymm19[3]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm10, %zmm25, %zmm10
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm12, %zmm23 {%k1}
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm7 = ymm21[1],ymm7[1],ymm21[3],ymm7[3]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm7, %zmm23, %zmm7
+; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm2 {%k1}
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm8 = ymm14[1],ymm9[1],ymm14[3],ymm9[3]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm8, %zmm2, %zmm2
 ; AVX512F-NEXT:    vmovdqa64 %zmm1, 448(%rsi)
-; AVX512F-NEXT:    vmovdqa64 %zmm2, 384(%rsi)
+; AVX512F-NEXT:    vmovdqa64 %zmm15, 384(%rsi)
 ; AVX512F-NEXT:    vmovdqa64 %zmm18, 320(%rsi)
-; AVX512F-NEXT:    vmovdqa64 %zmm5, 256(%rsi)
-; AVX512F-NEXT:    vmovdqa64 %zmm13, 192(%rsi)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm1, 128(%rsi)
+; AVX512F-NEXT:    vmovdqa64 %zmm3, 256(%rsi)
+; AVX512F-NEXT:    vmovdqa64 %zmm19, 192(%rsi)
+; AVX512F-NEXT:    vmovdqa64 %zmm24, 128(%rsi)
 ; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
 ; AVX512F-NEXT:    vmovaps %zmm1, 64(%rsi)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm1, (%rsi)
-; AVX512F-NEXT:    vmovdqa64 %zmm7, 448(%rdx)
-; AVX512F-NEXT:    vmovdqa64 %zmm10, 256(%rdx)
-; AVX512F-NEXT:    vmovdqa64 %zmm9, 320(%rdx)
-; AVX512F-NEXT:    vmovdqa64 %zmm8, 128(%rdx)
-; AVX512F-NEXT:    vmovdqa64 %zmm4, 192(%rdx)
+; AVX512F-NEXT:    vmovdqa64 %zmm26, (%rsi)
+; AVX512F-NEXT:    vmovdqa64 %zmm2, 448(%rdx)
+; AVX512F-NEXT:    vmovdqa64 %zmm6, 256(%rdx)
+; AVX512F-NEXT:    vmovdqa64 %zmm7, 320(%rdx)
+; AVX512F-NEXT:    vmovdqa64 %zmm4, 128(%rdx)
+; AVX512F-NEXT:    vmovdqa64 %zmm5, 192(%rdx)
 ; AVX512F-NEXT:    vmovdqa64 %zmm0, (%rdx)
-; AVX512F-NEXT:    vmovdqa64 %zmm6, 64(%rdx)
-; AVX512F-NEXT:    vmovdqa64 %zmm3, 384(%rdx)
+; AVX512F-NEXT:    vmovdqa64 %zmm13, 64(%rdx)
+; AVX512F-NEXT:    vmovdqa64 %zmm16, 384(%rdx)
 ; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512F-NEXT:    vmovaps %zmm0, 448(%rcx)
 ; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
@@ -10181,61 +10185,60 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512F-NEXT:    vmovaps %zmm0, 384(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm25, 448(%rax)
 ; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 448(%rax)
-; AVX512F-NEXT:    vmovups (%rsp), %zmm0 # 64-byte Reload
 ; AVX512F-NEXT:    vmovaps %zmm0, 256(%rax)
 ; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512F-NEXT:    vmovaps %zmm0, 320(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm11, 128(%rax)
+; AVX512F-NEXT:    vmovups (%rsp), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm0, 128(%rax)
 ; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512F-NEXT:    vmovaps %zmm0, 192(%rax)
 ; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512F-NEXT:    vmovaps %zmm0, (%rax)
 ; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512F-NEXT:    vmovaps %zmm0, 64(%rax)
-; AVX512F-NEXT:    addq $6600, %rsp # imm = 0x19C8
+; AVX512F-NEXT:    addq $6664, %rsp # imm = 0x1A08
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512BW-LABEL: load_i64_stride8_vf64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    subq $6600, %rsp # imm = 0x19C8
-; AVX512BW-NEXT:    vmovdqa64 3392(%rdi), %zmm13
-; AVX512BW-NEXT:    vmovdqa64 3328(%rdi), %zmm7
-; AVX512BW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 3520(%rdi), %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 3456(%rdi), %zmm10
-; AVX512BW-NEXT:    vmovdqa64 1856(%rdi), %zmm17
-; AVX512BW-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 1984(%rdi), %zmm12
-; AVX512BW-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 832(%rdi), %zmm8
-; AVX512BW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 768(%rdi), %zmm4
-; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 960(%rdi), %zmm9
-; AVX512BW-NEXT:    vmovdqa64 896(%rdi), %zmm11
+; AVX512BW-NEXT:    subq $6664, %rsp # imm = 0x1A08
+; AVX512BW-NEXT:    vmovdqa64 3392(%rdi), %zmm3
+; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 3328(%rdi), %zmm16
+; AVX512BW-NEXT:    vmovdqa64 3520(%rdi), %zmm8
+; AVX512BW-NEXT:    vmovdqa64 3456(%rdi), %zmm28
+; AVX512BW-NEXT:    vmovdqa64 1856(%rdi), %zmm10
+; AVX512BW-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 1984(%rdi), %zmm11
 ; AVX512BW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 320(%rdi), %zmm15
-; AVX512BW-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 256(%rdi), %zmm5
+; AVX512BW-NEXT:    vmovdqa64 832(%rdi), %zmm4
+; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 768(%rdi), %zmm5
 ; AVX512BW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 448(%rdi), %zmm16
-; AVX512BW-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 384(%rdi), %zmm6
+; AVX512BW-NEXT:    vmovdqa64 960(%rdi), %zmm6
+; AVX512BW-NEXT:    vmovdqa64 896(%rdi), %zmm9
+; AVX512BW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 320(%rdi), %zmm12
+; AVX512BW-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 256(%rdi), %zmm15
+; AVX512BW-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 448(%rdi), %zmm13
+; AVX512BW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 384(%rdi), %zmm14
+; AVX512BW-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    movb $-64, %al
 ; AVX512BW-NEXT:    kmovd %eax, %k1
 ; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10]
 ; AVX512BW-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm0
-; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm29
-; AVX512BW-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
-; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm1
-; AVX512BW-NEXT:    vpermt2q %zmm13, %zmm2, %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm28, %zmm0
+; AVX512BW-NEXT:    vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm8, %zmm2, %zmm0
+; AVX512BW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm16, %zmm1
+; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm2, %zmm1
 ; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
 ; AVX512BW-NEXT:    vmovdqa 3264(%rdi), %ymm3
 ; AVX512BW-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -10244,62 +10247,60 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
 ; AVX512BW-NEXT:    vmovdqa 3136(%rdi), %ymm3
 ; AVX512BW-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512BW-NEXT:    vmovdqa 3072(%rdi), %ymm14
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm14[0],ymm3[0],ymm14[2],ymm3[2]
+; AVX512BW-NEXT:    vmovdqa 3072(%rdi), %ymm7
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm7[0],ymm3[0],ymm7[2],ymm3[2]
 ; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
 ; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
 ; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm9, %zmm2, %zmm0
-; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm7
-; AVX512BW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm1
-; AVX512BW-NEXT:    vpermt2q %zmm8, %zmm2, %zmm1
+; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm0
+; AVX512BW-NEXT:    vpermt2q %zmm6, %zmm2, %zmm0
+; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm9
+; AVX512BW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm1
+; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm2, %zmm1
 ; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512BW-NEXT:    vmovdqa 704(%rdi), %ymm3
-; AVX512BW-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512BW-NEXT:    vmovdqa 640(%rdi), %ymm0
+; AVX512BW-NEXT:    vmovdqa 704(%rdi), %ymm0
 ; AVX512BW-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
-; AVX512BW-NEXT:    vmovdqa64 576(%rdi), %ymm26
-; AVX512BW-NEXT:    vmovdqa64 512(%rdi), %ymm23
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm23[0],ymm26[0],ymm23[2],ymm26[2]
+; AVX512BW-NEXT:    vmovdqa64 640(%rdi), %ymm20
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm20[0],ymm0[0],ymm20[2],ymm0[2]
+; AVX512BW-NEXT:    vmovdqa64 576(%rdi), %ymm22
+; AVX512BW-NEXT:    vmovdqa64 512(%rdi), %ymm19
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm19[0],ymm22[0],ymm19[2],ymm22[2]
 ; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
 ; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
 ; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm0
-; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm8
-; AVX512BW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm16, %zmm2, %zmm0
-; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm1
-; AVX512BW-NEXT:    vpermt2q %zmm15, %zmm2, %zmm1
+; AVX512BW-NEXT:    vmovdqa64 %zmm14, %zmm0
+; AVX512BW-NEXT:    vpermt2q %zmm13, %zmm2, %zmm0
+; AVX512BW-NEXT:    vmovdqa64 %zmm15, %zmm1
+; AVX512BW-NEXT:    vpermt2q %zmm12, %zmm2, %zmm1
 ; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512BW-NEXT:    vmovdqa 192(%rdi), %ymm0
-; AVX512BW-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 128(%rdi), %ymm30
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm30[0],ymm0[0],ymm30[2],ymm0[2]
-; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %ymm20
-; AVX512BW-NEXT:    vmovdqa64 (%rdi), %ymm16
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm16[0],ymm20[0],ymm16[2],ymm20[2]
+; AVX512BW-NEXT:    vmovdqa 192(%rdi), %ymm14
+; AVX512BW-NEXT:    vmovdqa 128(%rdi), %ymm15
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm15[0],ymm14[0],ymm15[2],ymm14[2]
+; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %ymm21
+; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm13
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm13[0],ymm21[0],ymm13[2],ymm21[2]
 ; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
 ; AVX512BW-NEXT:    vmovdqa64 1920(%rdi), %zmm3
 ; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
 ; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm12, %zmm2, %zmm0
+; AVX512BW-NEXT:    vpermt2q %zmm11, %zmm2, %zmm0
 ; AVX512BW-NEXT:    vmovdqa64 1792(%rdi), %zmm1
 ; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm17, %zmm2, %zmm1
+; AVX512BW-NEXT:    vpermt2q %zmm10, %zmm2, %zmm1
 ; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
 ; AVX512BW-NEXT:    vmovdqa 1728(%rdi), %ymm3
 ; AVX512BW-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512BW-NEXT:    vmovdqa 1664(%rdi), %ymm0
 ; AVX512BW-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
-; AVX512BW-NEXT:    vmovdqa64 1600(%rdi), %ymm21
-; AVX512BW-NEXT:    vmovdqa64 1536(%rdi), %ymm17
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm17[0],ymm21[0],ymm17[2],ymm21[2]
+; AVX512BW-NEXT:    vmovdqa 1600(%rdi), %ymm4
+; AVX512BW-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512BW-NEXT:    vmovdqa 1536(%rdi), %ymm3
+; AVX512BW-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2]
 ; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
 ; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
 ; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -10310,61 +10311,64 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
 ; AVX512BW-NEXT:    vmovdqa64 1344(%rdi), %zmm3
 ; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 1280(%rdi), %zmm11
-; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 1280(%rdi), %zmm1
+; AVX512BW-NEXT:    vmovdqu64 %zmm1, (%rsp) # 64-byte Spill
 ; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm2, %zmm1
 ; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512BW-NEXT:    vmovdqa 1216(%rdi), %ymm0
+; AVX512BW-NEXT:    vmovdqa 1216(%rdi), %ymm3
+; AVX512BW-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512BW-NEXT:    vmovdqa 1152(%rdi), %ymm0
 ; AVX512BW-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 1152(%rdi), %ymm25
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm25[0],ymm0[0],ymm25[2],ymm0[2]
-; AVX512BW-NEXT:    vmovdqa64 1088(%rdi), %ymm24
-; AVX512BW-NEXT:    vmovdqa64 1024(%rdi), %ymm22
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm22[0],ymm24[0],ymm22[2],ymm24[2]
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
+; AVX512BW-NEXT:    vmovdqa 1088(%rdi), %ymm4
+; AVX512BW-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512BW-NEXT:    vmovdqa 1024(%rdi), %ymm3
+; AVX512BW-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2]
 ; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
 ; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
 ; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqa64 3008(%rdi), %zmm1
 ; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 2944(%rdi), %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 2944(%rdi), %zmm29
+; AVX512BW-NEXT:    vmovdqa64 %zmm29, %zmm0
+; AVX512BW-NEXT:    vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
-; AVX512BW-NEXT:    vmovdqa64 2880(%rdi), %zmm3
-; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 2816(%rdi), %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm2, %zmm1
+; AVX512BW-NEXT:    vmovdqa64 2880(%rdi), %zmm24
+; AVX512BW-NEXT:    vmovdqa64 2816(%rdi), %zmm25
+; AVX512BW-NEXT:    vmovdqa64 %zmm25, %zmm1
+; AVX512BW-NEXT:    vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm24, %zmm2, %zmm1
+; AVX512BW-NEXT:    vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512BW-NEXT:    vmovdqa 2752(%rdi), %ymm3
-; AVX512BW-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512BW-NEXT:    vmovdqa 2688(%rdi), %ymm0
-; AVX512BW-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
-; AVX512BW-NEXT:    vmovdqa64 2624(%rdi), %ymm31
-; AVX512BW-NEXT:    vmovdqa 2560(%rdi), %ymm10
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm10[0],ymm31[0],ymm10[2],ymm31[2]
+; AVX512BW-NEXT:    vmovdqa64 2752(%rdi), %ymm27
+; AVX512BW-NEXT:    vmovdqa64 2688(%rdi), %ymm26
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm26[0],ymm27[0],ymm26[2],ymm27[2]
+; AVX512BW-NEXT:    vmovdqa64 2624(%rdi), %ymm30
+; AVX512BW-NEXT:    vmovdqa64 2560(%rdi), %ymm18
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm18[0],ymm30[0],ymm18[2],ymm30[2]
 ; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
 ; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
 ; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqa64 2496(%rdi), %zmm1
 ; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 2432(%rdi), %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 2432(%rdi), %zmm31
+; AVX512BW-NEXT:    vmovdqa64 %zmm31, %zmm0
+; AVX512BW-NEXT:    vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
-; AVX512BW-NEXT:    vmovdqa64 2368(%rdi), %zmm18
+; AVX512BW-NEXT:    vmovdqa64 2368(%rdi), %zmm3
+; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqa64 2304(%rdi), %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, (%rsp) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm18, %zmm2, %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm2, %zmm1
 ; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 2240(%rdi), %ymm28
-; AVX512BW-NEXT:    vmovdqa64 2176(%rdi), %ymm19
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm19[0],ymm28[0],ymm19[2],ymm28[2]
-; AVX512BW-NEXT:    vmovdqa64 2112(%rdi), %ymm27
-; AVX512BW-NEXT:    vmovdqa 2048(%rdi), %ymm6
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm6[0],ymm27[0],ymm6[2],ymm27[2]
-; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
+; AVX512BW-NEXT:    vmovdqa 2240(%rdi), %ymm12
+; AVX512BW-NEXT:    vmovdqa 2176(%rdi), %ymm11
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm11[0],ymm12[0],ymm11[2],ymm12[2]
+; AVX512BW-NEXT:    vmovdqa 2112(%rdi), %ymm10
+; AVX512BW-NEXT:    vmovdqa 2048(%rdi), %ymm3
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm5 = ymm3[0],ymm10[0],ymm3[2],ymm10[2]
+; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm5[2,3],ymm0[2,3]
 ; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
 ; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqa64 4032(%rdi), %zmm1
@@ -10372,807 +10376,801 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-NEXT:    vmovdqa64 3968(%rdi), %zmm0
 ; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
-; AVX512BW-NEXT:    vmovdqa64 3904(%rdi), %zmm5
-; AVX512BW-NEXT:    vmovdqa64 3840(%rdi), %zmm3
-; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermi2q %zmm5, %zmm3, %zmm2
+; AVX512BW-NEXT:    vmovdqa64 3904(%rdi), %zmm1
+; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 3840(%rdi), %zmm4
+; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermi2q %zmm1, %zmm4, %zmm2
 ; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k1}
-; AVX512BW-NEXT:    vmovdqa 3776(%rdi), %ymm12
-; AVX512BW-NEXT:    vmovdqa 3712(%rdi), %ymm9
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm9[0],ymm12[0],ymm9[2],ymm12[2]
-; AVX512BW-NEXT:    vmovdqa 3648(%rdi), %ymm4
-; AVX512BW-NEXT:    vmovdqa 3584(%rdi), %ymm1
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm15 = ymm1[0],ymm4[0],ymm1[2],ymm4[2]
-; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 3776(%rdi), %ymm17
+; AVX512BW-NEXT:    vmovdqa64 3712(%rdi), %ymm23
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm5 = ymm23[0],ymm17[0],ymm23[2],ymm17[2]
+; AVX512BW-NEXT:    vmovdqa 3648(%rdi), %ymm1
+; AVX512BW-NEXT:    vmovdqa 3584(%rdi), %ymm0
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm6 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm5 = ymm6[2,3],ymm5[2,3]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm5, %zmm2, %zmm2
+; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,3,11,3,11,3,11]
 ; AVX512BW-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm29, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm2, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm13, %zmm2, %zmm15
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm15 {%k1}
-; AVX512BW-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512BW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX512BW-NEXT:    # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
-; AVX512BW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload
-; AVX512BW-NEXT:    # ymm14 = ymm14[1],mem[1],ymm14[3],mem[3]
-; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm14[2,3],ymm0[2,3]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm15, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm7, %zmm2, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm14
-; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm14 # 64-byte Folded Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm14 {%k1}
-; AVX512BW-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512BW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX512BW-NEXT:    # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm15 = ymm23[1],ymm26[1],ymm23[3],ymm26[3]
-; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm14, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm23, %zmm2, %zmm8
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm29, %zmm14
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm26, %zmm2, %zmm14
-; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm14 {%k1}
-; AVX512BW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm30, %ymm0 # 32-byte Folded Reload
-; AVX512BW-NEXT:    # ymm0 = ymm30[1],mem[1],ymm30[3],mem[3]
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm15 = ymm16[1],ymm20[1],ymm16[3],ymm20[3]
-; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm14, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm30, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm20, %zmm2, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm14
+; AVX512BW-NEXT:    vpermt2q %zmm8, %zmm2, %zmm28
+; AVX512BW-NEXT:    vmovdqa64 %zmm16, %zmm8
+; AVX512BW-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm16, %zmm6
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm16, %zmm2, %zmm14
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm14 {%k1}
-; AVX512BW-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512BW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX512BW-NEXT:    # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm15 = ymm17[1],ymm21[1],ymm17[3],ymm21[3]
-; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm14, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm21, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm17, %zmm2, %zmm0
-; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm14
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm11, %zmm2, %zmm14
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm14 {%k1}
-; AVX512BW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm25, %ymm0 # 32-byte Folded Reload
-; AVX512BW-NEXT:    # ymm0 = ymm25[1],mem[1],ymm25[3],mem[3]
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm15 = ymm22[1],ymm24[1],ymm22[3],ymm24[3]
-; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm14, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm16, %zmm2, %zmm6
+; AVX512BW-NEXT:    vmovdqa64 %zmm28, %zmm6 {%k1}
+; AVX512BW-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX512BW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload
+; AVX512BW-NEXT:    # ymm5 = ymm4[1],mem[1],ymm4[3],mem[3]
+; AVX512BW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
+; AVX512BW-NEXT:    # ymm7 = ymm7[1],mem[1],ymm7[3],mem[3]
+; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm5, %zmm6, %zmm5
+; AVX512BW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm28, %zmm5
+; AVX512BW-NEXT:    vpermt2q %zmm9, %zmm2, %zmm5
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm6
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm2, %zmm6
+; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm6 {%k1}
+; AVX512BW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm20, %ymm5 # 32-byte Folded Reload
+; AVX512BW-NEXT:    # ymm5 = ymm20[1],mem[1],ymm20[3],mem[3]
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm7 = ymm19[1],ymm22[1],ymm19[3],ymm22[3]
+; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm5, %zmm6, %zmm5
+; AVX512BW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm20, %zmm5
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm19, %zmm2, %zmm5
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm22, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm25, %zmm2, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm24, %zmm14
+; AVX512BW-NEXT:    vpermt2q %zmm22, %zmm2, %zmm6
+; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm6 {%k1}
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm5 = ymm15[1],ymm14[1],ymm15[3],ymm14[3]
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm7 = ymm13[1],ymm21[1],ymm13[3],ymm21[3]
+; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm5, %zmm6, %zmm5
+; AVX512BW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm14, %zmm5
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm15, %zmm2, %zmm14
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm14 {%k1}
-; AVX512BW-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512BW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX512BW-NEXT:    # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm10 = ymm10[1],ymm31[1],ymm10[3],ymm31[3]
-; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm10[2,3],ymm0[2,3]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm14, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm31, %zmm0
-; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm0 # 64-byte Folded Reload
-; AVX512BW-NEXT:    vmovdqu64 (%rsp), %zmm10 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm18, %zmm2, %zmm10
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm10 {%k1}
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm19[1],ymm28[1],ymm19[3],ymm28[3]
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],ymm27[1],ymm6[3],ymm27[3]
-; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm6[2,3],ymm0[2,3]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm10, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm0
+; AVX512BW-NEXT:    vpermt2q %zmm15, %zmm2, %zmm5
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm21, %zmm6
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm13, %zmm2, %zmm6
+; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm6 {%k1}
+; AVX512BW-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX512BW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
+; AVX512BW-NEXT:    # ymm5 = ymm5[1],mem[1],ymm5[3],mem[3]
+; AVX512BW-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX512BW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
+; AVX512BW-NEXT:    # ymm7 = ymm7[1],mem[1],ymm7[3],mem[3]
+; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm5, %zmm6, %zmm5
+; AVX512BW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm5 # 64-byte Folded Reload
+; AVX512BW-NEXT:    vmovdqu64 (%rsp), %zmm6 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm6 # 64-byte Folded Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm6 {%k1}
+; AVX512BW-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX512BW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
+; AVX512BW-NEXT:    # ymm5 = ymm5[1],mem[1],ymm5[3],mem[3]
+; AVX512BW-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX512BW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
+; AVX512BW-NEXT:    # ymm7 = ymm7[1],mem[1],ymm7[3],mem[3]
+; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm5, %zmm6, %zmm5
+; AVX512BW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm29 # 64-byte Folded Reload
+; AVX512BW-NEXT:    vpermt2q %zmm24, %zmm2, %zmm25
+; AVX512BW-NEXT:    vmovdqa64 %zmm29, %zmm25 {%k1}
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm5 = ymm26[1],ymm27[1],ymm26[3],ymm27[3]
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm7 = ymm18[1],ymm30[1],ymm18[3],ymm30[3]
+; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm5, %zmm25, %zmm5
+; AVX512BW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm24, %zmm2, %zmm31
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm26, %zmm6
+; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm6 # 64-byte Folded Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm31, %zmm6 {%k1}
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm5 = ymm11[1],ymm12[1],ymm11[3],ymm12[3]
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],ymm10[1],ymm3[3],ymm10[3]
+; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm5[2,3]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm3, %zmm6, %zmm3
+; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm29, %zmm3
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm18, %zmm2, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
-; AVX512BW-NEXT:    vpermi2q %zmm5, %zmm19, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm28
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k1}
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm9[1],ymm12[1],ymm9[3],ymm12[3]
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],ymm4[1],ymm1[3],ymm4[3]
-; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
+; AVX512BW-NEXT:    vpermt2q %zmm18, %zmm2, %zmm3
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512BW-NEXT:    vpermi2q %zmm11, %zmm12, %zmm2
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm2 {%k1}
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm23[1],ymm17[1],ymm23[3],ymm17[3]
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3]
 ; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm0
 ; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 3264(%rdi), %zmm3
+; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 3200(%rdi), %zmm10
 ; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12]
 ; AVX512BW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm1
 ; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm0, %zmm1
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm2 = zmm27[0],zmm12[0],zmm27[2],zmm12[2],zmm27[4],zmm12[4],zmm27[6],zmm12[6]
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 3136(%rdi), %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 3072(%rdi), %zmm3
-; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm0, %zmm3
-; AVX512BW-NEXT:    vmovdqa64 3264(%rdi), %zmm5
-; AVX512BW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 3200(%rdi), %zmm4
-; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [4,12,4,12]
-; AVX512BW-NEXT:    # ymm1 = mem[0,1,0,1]
-; AVX512BW-NEXT:    vpermt2q %zmm5, %zmm1, %zmm4
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm3, %zmm2, %zmm2
+; AVX512BW-NEXT:    vmovdqa64 3136(%rdi), %zmm23
+; AVX512BW-NEXT:    vmovdqa64 3072(%rdi), %zmm2
 ; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm6
-; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm2
-; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm7[0],zmm13[0],zmm7[2],zmm13[2],zmm7[4],zmm13[4],zmm7[6],zmm13[6]
+; AVX512BW-NEXT:    vpermt2q %zmm23, %zmm0, %zmm2
+; AVX512BW-NEXT:    vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm17, %zmm2
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm25, %zmm0, %zmm2
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm8[0],zmm16[0],zmm8[2],zmm16[2],zmm8[4],zmm16[4],zmm8[6],zmm16[6]
 ; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 576(%rdi), %zmm5
-; AVX512BW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 512(%rdi), %zmm2
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm3, %zmm1
+; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 704(%rdi), %zmm2
 ; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm5, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 704(%rdi), %zmm5
-; AVX512BW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 640(%rdi), %zmm4
-; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm5, %zmm1, %zmm4
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512BW-NEXT:    vmovdqa64 640(%rdi), %zmm1
+; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm0, %zmm1
+; AVX512BW-NEXT:    vmovdqa64 576(%rdi), %zmm3
+; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 512(%rdi), %zmm2
 ; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm23, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 %zmm23, %zmm7
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm29[0],zmm26[0],zmm29[2],zmm26[2],zmm29[4],zmm26[4],zmm29[6],zmm26[6]
+; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm0, %zmm2
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqa64 %zmm28, %zmm2
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm30, %zmm0, %zmm2
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm9[0],zmm4[0],zmm9[2],zmm4[2],zmm9[4],zmm4[4],zmm9[6],zmm4[6]
 ; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %zmm4
-; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm2
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm3, %zmm1
+; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 192(%rdi), %zmm2
 ; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 192(%rdi), %zmm23
-; AVX512BW-NEXT:    vmovdqa64 128(%rdi), %zmm5
-; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm4
-; AVX512BW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm23, %zmm1, %zmm4
-; AVX512BW-NEXT:    vmovdqa64 %zmm23, %zmm26
-; AVX512BW-NEXT:    vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512BW-NEXT:    vmovdqa64 128(%rdi), %zmm1
+; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm0, %zmm1
+; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %zmm3
+; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm2
 ; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm30, %zmm2
-; AVX512BW-NEXT:    vpermt2q %zmm20, %zmm0, %zmm2
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm8[0],zmm16[0],zmm8[2],zmm16[2],zmm8[4],zmm16[4],zmm8[6],zmm16[6]
+; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm0, %zmm2
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqa64 %zmm20, %zmm2
+; AVX512BW-NEXT:    vpermt2q %zmm19, %zmm0, %zmm2
+; AVX512BW-NEXT:    vmovdqa64 %zmm19, %zmm16
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm3[0],zmm22[0],zmm3[2],zmm22[2],zmm3[4],zmm22[4],zmm3[6],zmm22[6]
 ; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 1600(%rdi), %zmm30
-; AVX512BW-NEXT:    vmovdqa64 1536(%rdi), %zmm2
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm3, %zmm1
+; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 1728(%rdi), %zmm2
 ; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm30, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 1728(%rdi), %zmm20
-; AVX512BW-NEXT:    vmovdqa64 1664(%rdi), %zmm4
-; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm20, %zmm1, %zmm4
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512BW-NEXT:    vmovdqa64 1664(%rdi), %zmm1
+; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm0, %zmm1
+; AVX512BW-NEXT:    vmovdqa64 1600(%rdi), %zmm3
+; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 1536(%rdi), %zmm2
 ; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm21, %zmm2
-; AVX512BW-NEXT:    vpermt2q %zmm17, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm9[0],zmm11[0],zmm9[2],zmm11[2],zmm9[4],zmm11[4],zmm9[6],zmm11[6]
+; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm0, %zmm2
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqa64 %zmm14, %zmm2
+; AVX512BW-NEXT:    vpermt2q %zmm15, %zmm0, %zmm2
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm21[0],zmm13[0],zmm21[2],zmm13[2],zmm21[4],zmm13[4],zmm21[6],zmm13[6]
 ; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 1088(%rdi), %zmm4
-; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 1024(%rdi), %zmm2
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm3, %zmm1
+; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 1216(%rdi), %zmm2
 ; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 1216(%rdi), %zmm16
-; AVX512BW-NEXT:    vmovdqa64 1152(%rdi), %zmm4
-; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm16, %zmm1, %zmm4
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512BW-NEXT:    vmovdqa64 1152(%rdi), %zmm1
+; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm0, %zmm1
+; AVX512BW-NEXT:    vmovdqa64 1088(%rdi), %zmm3
+; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 1024(%rdi), %zmm2
 ; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm22, %zmm2
-; AVX512BW-NEXT:    vpermt2q %zmm25, %zmm0, %zmm2
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm24[0],zmm15[0],zmm24[2],zmm15[2],zmm24[4],zmm15[4],zmm24[6],zmm15[6]
+; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm0, %zmm2
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm31, %zmm2
+; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
+; AVX512BW-NEXT:    vmovdqu64 (%rsp), %zmm27 # 64-byte Reload
+; AVX512BW-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm3 # 64-byte Folded Reload
+; AVX512BW-NEXT:    # zmm3 = zmm27[0],mem[0],zmm27[2],mem[2],zmm27[4],mem[4],zmm27[6],mem[6]
 ; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 2624(%rdi), %zmm4
-; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm3, %zmm1
+; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 2752(%rdi), %zmm3
+; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 2688(%rdi), %zmm8
+; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm1
+; AVX512BW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm0, %zmm1
+; AVX512BW-NEXT:    vmovdqa64 2624(%rdi), %zmm3
+; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqa64 2560(%rdi), %zmm2
 ; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 2752(%rdi), %zmm14
-; AVX512BW-NEXT:    vmovdqa64 2688(%rdi), %zmm4
-; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm14, %zmm1, %zmm4
-; AVX512BW-NEXT:    vmovdqa64 %zmm14, %zmm25
-; AVX512BW-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm0, %zmm2
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm2
+; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512BW-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm3 # 64-byte Folded Reload
+; AVX512BW-NEXT:    # zmm3 = zmm7[0],mem[0],zmm7[2],mem[2],zmm7[4],mem[4],zmm7[6],mem[6]
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm3, %zmm1
+; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 2240(%rdi), %zmm2
 ; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 2176(%rdi), %zmm1
+; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm0, %zmm1
 ; AVX512BW-NEXT:    vmovdqa64 2112(%rdi), %zmm3
 ; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqa64 2048(%rdi), %zmm2
 ; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 2240(%rdi), %zmm4
-; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 2176(%rdi), %zmm3
-; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm1, %zmm3
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm31 # 64-byte Folded Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqu64 (%rsp), %zmm4 # 64-byte Reload
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm4 = zmm4[0],zmm8[0],zmm4[2],zmm8[2],zmm4[4],zmm8[4],zmm4[6],zmm8[6]
-; AVX512BW-NEXT:    vmovdqa64 %zmm31, %zmm4 {%k1}
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm2
+; AVX512BW-NEXT:    vpermt2q %zmm24, %zmm0, %zmm2
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm26[0],zmm4[0],zmm26[2],zmm4[2],zmm26[4],zmm4[4],zmm26[6],zmm4[6]
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm3, %zmm1
+; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 3776(%rdi), %zmm2
 ; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm18, %zmm0, %zmm10
-; AVX512BW-NEXT:    vmovdqa64 3648(%rdi), %zmm4
-; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 3584(%rdi), %zmm3
-; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermi2q %zmm4, %zmm3, %zmm0
-; AVX512BW-NEXT:    vmovdqa64 3776(%rdi), %zmm4
-; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 3712(%rdi), %zmm3
+; AVX512BW-NEXT:    vmovdqa64 3712(%rdi), %zmm1
+; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm0, %zmm1
+; AVX512BW-NEXT:    vmovdqa64 3648(%rdi), %zmm3
 ; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermi2q %zmm4, %zmm3, %zmm1
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqa64 %zmm28, %zmm18
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm1 = zmm19[0],zmm28[0],zmm19[2],zmm28[2],zmm19[4],zmm28[4],zmm19[6],zmm28[6]
-; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm1 {%k1}
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512BW-NEXT:    vmovdqa64 3584(%rdi), %zmm2
+; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm0, %zmm2
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-NEXT:    vpermi2q %zmm18, %zmm29, %zmm0
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm2 = zmm12[0],zmm11[0],zmm12[2],zmm11[2],zmm12[4],zmm11[4],zmm12[6],zmm11[6]
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k1}
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm2, %zmm0
 ; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [5,13,5,13,5,13,5,13]
 ; AVX512BW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm26, %zmm0, %zmm10
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
-; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [5,13,5,13]
-; AVX512BW-NEXT:    # ymm1 = mem[0,1,0,1]
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm3 # 64-byte Folded Reload
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm4 = zmm27[1],zmm12[1],zmm27[3],zmm12[3],zmm27[5],zmm12[5],zmm27[7],zmm12[7]
-; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm23, %zmm0, %zmm2
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm10[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqa64 %zmm17, %zmm2
+; AVX512BW-NEXT:    vpermt2q %zmm25, %zmm0, %zmm2
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm3 = zmm19[1],zmm25[1],zmm19[3],zmm25[3],zmm19[5],zmm25[5],zmm19[7],zmm25[7]
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm3, %zmm1
+; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm22, %zmm0, %zmm1
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
 ; AVX512BW-NEXT:    vpermt2q %zmm24, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqa64 %zmm28, %zmm29
+; AVX512BW-NEXT:    vmovdqa64 %zmm28, %zmm2
+; AVX512BW-NEXT:    vpermt2q %zmm30, %zmm0, %zmm2
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm28, %zmm1, %zmm3
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm3
-; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm23
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm4 = zmm31[1],zmm13[1],zmm31[3],zmm13[3],zmm31[5],zmm13[5],zmm31[7],zmm13[7]
-; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm3 = zmm15[1],zmm28[1],zmm15[3],zmm28[3],zmm15[5],zmm28[5],zmm15[7],zmm28[7]
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm3, %zmm1
+; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm20, %zmm0, %zmm1
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm22, %zmm0, %zmm2
-; AVX512BW-NEXT:    vpermt2q %zmm26, %zmm1, %zmm5
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm7, %zmm0, %zmm3
-; AVX512BW-NEXT:    vmovdqa64 %zmm29, %zmm14
-; AVX512BW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm4 # 64-byte Folded Reload
-; AVX512BW-NEXT:    # zmm4 = zmm29[1],mem[1],zmm29[3],mem[3],zmm29[5],mem[5],zmm29[7],mem[7]
-; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm21, %zmm0, %zmm2
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm30, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm20, %zmm1, %zmm3
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
+; AVX512BW-NEXT:    vpermt2q %zmm16, %zmm0, %zmm2
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm3 = zmm12[1],zmm18[1],zmm12[3],zmm18[3],zmm12[5],zmm18[5],zmm12[7],zmm18[7]
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm3, %zmm1
+; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
-; AVX512BW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm4 # 64-byte Folded Reload
-; AVX512BW-NEXT:    # zmm4 = zmm17[1],mem[1],zmm17[3],mem[3],zmm17[5],mem[5],zmm17[7],mem[7]
-; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm17, %zmm0, %zmm1
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm19, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm15, %zmm3
-; AVX512BW-NEXT:    vmovdqa64 %zmm16, %zmm29
-; AVX512BW-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm16, %zmm1, %zmm3
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm16, %zmm0, %zmm2
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqa64 %zmm14, %zmm2
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm23, %zmm0, %zmm2
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm16
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm4 = zmm9[1],zmm11[1],zmm9[3],zmm11[3],zmm9[5],zmm11[5],zmm9[7],zmm11[7]
-; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm12, %zmm0, %zmm2
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm3 = zmm3[1],zmm13[1],zmm3[3],zmm13[3],zmm3[5],zmm13[5],zmm3[7],zmm13[7]
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm3, %zmm1
+; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm3
-; AVX512BW-NEXT:    vpermt2q %zmm25, %zmm1, %zmm3
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload
-; AVX512BW-NEXT:    # zmm4 = zmm4[1],mem[1],zmm4[3],mem[3],zmm4[5],mem[5],zmm4[7],mem[7]
-; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm13, %zmm0, %zmm1
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm14, %zmm0, %zmm2
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm31 # 64-byte Folded Reload
+; AVX512BW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm3 # 64-byte Folded Reload
+; AVX512BW-NEXT:    # zmm3 = zmm27[1],mem[1],zmm27[3],mem[3],zmm27[5],mem[5],zmm27[7],mem[7]
+; AVX512BW-NEXT:    vmovdqa64 %zmm31, %zmm3 {%k1}
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm3, %zmm1
+; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm31, %zmm0, %zmm8
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm11, %zmm0, %zmm2
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm8[4,5,6,7]
+; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm9 # 64-byte Folded Reload
+; AVX512BW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm3 # 64-byte Folded Reload
+; AVX512BW-NEXT:    # zmm3 = zmm7[1],mem[1],zmm7[3],mem[3],zmm7[5],mem[5],zmm7[7],mem[7]
+; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm3 {%k1}
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm3, %zmm1
+; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm27, %zmm0, %zmm1
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm7, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm3
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm10, %zmm1, %zmm3
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm26, %zmm3
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm21, %zmm0, %zmm3
-; AVX512BW-NEXT:    vmovdqu64 (%rsp), %zmm4 # 64-byte Reload
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm4 = zmm4[1],zmm8[1],zmm4[3],zmm8[3],zmm4[5],zmm8[5],zmm4[7],zmm8[7]
-; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm2
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm8, %zmm0, %zmm2
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm3 = zmm3[1],zmm4[1],zmm3[3],zmm4[3],zmm3[5],zmm4[5],zmm3[7],zmm4[7]
+; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm3 {%k1}
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm3, %zmm1
+; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-NEXT:    vpermi2q %zmm6, %zmm3, %zmm0
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm1
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm5, %zmm0, %zmm1
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-NEXT:    vpermi2q %zmm4, %zmm5, %zmm1
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm1 = zmm25[1],zmm18[1],zmm25[3],zmm18[3],zmm25[5],zmm18[5],zmm25[7],zmm18[7]
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm1 {%k1}
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm2
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm6, %zmm0, %zmm2
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512BW-NEXT:    vpermi2q %zmm10, %zmm9, %zmm0
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 64-byte Folded Reload
+; AVX512BW-NEXT:    # zmm2 = zmm2[1],mem[1],zmm2[3],mem[3],zmm2[5],mem[5],zmm2[7],mem[7]
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k1}
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm2, %zmm0
 ; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [6,14,6,14,6,14,6,14]
 ; AVX512BW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm26, %zmm0, %zmm1
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
 ; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
-; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14]
-; AVX512BW-NEXT:    # ymm1 = mem[0,1,0,1]
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm3 # 64-byte Folded Reload
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
-; AVX512BW-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm3 {%k1} # 64-byte Folded Reload
-; AVX512BW-NEXT:    # zmm3 {%k1} = zmm27[0],mem[0],zmm27[2],mem[2],zmm27[4],mem[4],zmm27[6],mem[6]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqa64 %zmm19, %zmm2
+; AVX512BW-NEXT:    vpermt2q %zmm25, %zmm0, %zmm2
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
+; AVX512BW-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm2 {%k1} # 64-byte Folded Reload
+; AVX512BW-NEXT:    # zmm2 {%k1} = zmm25[0],mem[0],zmm25[2],mem[2],zmm25[4],mem[4],zmm25[6],mem[6]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm2, %zmm1
+; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm22, %zmm0, %zmm1
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
 ; AVX512BW-NEXT:    vpermt2q %zmm24, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm28, %zmm1, %zmm3
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512BW-NEXT:    vpermt2q %zmm23, %zmm0, %zmm31
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
-; AVX512BW-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm24, %zmm31 {%k1} # 64-byte Folded Reload
-; AVX512BW-NEXT:    # zmm31 {%k1} = zmm24[0],mem[0],zmm24[2],mem[2],zmm24[4],mem[4],zmm24[6],mem[6]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm31, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm22, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm3 # 64-byte Folded Reload
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqa64 %zmm14, %zmm3
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm22, %zmm0, %zmm3
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm14[0],zmm28[0],zmm14[2],zmm28[2],zmm14[4],zmm28[4],zmm14[6],zmm28[6]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-NEXT:    vpermt2q %zmm28, %zmm0, %zmm15
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm15 {%k1} = zmm29[0],zmm30[0],zmm29[2],zmm30[2],zmm29[4],zmm30[4],zmm29[6],zmm30[6]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm15, %zmm1
+; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm20, %zmm0, %zmm1
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm30, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm20, %zmm1, %zmm3
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqa64 %zmm17, %zmm3
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm17, %zmm0, %zmm3
+; AVX512BW-NEXT:    vpermt2q %zmm21, %zmm0, %zmm2
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqa64 %zmm12, %zmm2
+; AVX512BW-NEXT:    vpermt2q %zmm18, %zmm0, %zmm2
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm30[0],zmm27[0],zmm30[2],zmm27[2],zmm30[4],zmm27[4],zmm30[6],zmm27[6]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm30[0],zmm26[0],zmm30[2],zmm26[2],zmm30[4],zmm26[4],zmm30[6],zmm26[6]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm2, %zmm1
+; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm17, %zmm0, %zmm1
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm19, %zmm0, %zmm2
-; AVX512BW-NEXT:    vpermt2q %zmm29, %zmm1, %zmm15
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm15[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm20, %zmm3
-; AVX512BW-NEXT:    vpermt2q %zmm16, %zmm0, %zmm3
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm16, %zmm0, %zmm2
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm31[0],zmm16[0],zmm31[2],zmm16[2],zmm31[4],zmm16[4],zmm31[6],zmm16[6]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm12, %zmm0, %zmm11
-; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm13 # 64-byte Folded Reload
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm13[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm3
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm13, %zmm0, %zmm3
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512BW-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm3 {%k1} # 64-byte Folded Reload
-; AVX512BW-NEXT:    # zmm3 {%k1} = zmm12[0],mem[0],zmm12[2],mem[2],zmm12[4],mem[4],zmm12[6],mem[6]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm16, %zmm2
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm29, %zmm0, %zmm2
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm24[0],zmm23[0],zmm24[2],zmm23[2],zmm24[4],zmm23[4],zmm24[6],zmm23[6]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm2, %zmm1
+; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm13, %zmm0, %zmm1
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm7, %zmm0, %zmm2
-; AVX512BW-NEXT:    vpermt2q %zmm10, %zmm1, %zmm9
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqu64 (%rsp), %zmm10 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm3
-; AVX512BW-NEXT:    vpermt2q %zmm8, %zmm0, %zmm3
-; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm7
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm26[0],zmm21[0],zmm26[2],zmm21[2],zmm26[4],zmm21[4],zmm26[6],zmm21[6]
-; AVX512BW-NEXT:    vmovdqa64 %zmm21, %zmm8
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm25, %zmm2
-; AVX512BW-NEXT:    vpermt2q %zmm18, %zmm0, %zmm2
+; AVX512BW-NEXT:    vpermt2q %zmm14, %zmm0, %zmm2
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqu64 (%rsp), %zmm2 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm21, %zmm0, %zmm2
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm23[0],zmm18[0],zmm23[2],zmm18[2],zmm23[4],zmm18[4],zmm23[6],zmm18[6]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm2, %zmm1
+; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm31, %zmm0, %zmm1
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm11, %zmm0, %zmm2
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512BW-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm2 {%k1} # 64-byte Folded Reload
+; AVX512BW-NEXT:    # zmm2 {%k1} = zmm11[0],mem[0],zmm11[2],mem[2],zmm11[4],mem[4],zmm11[6],mem[6]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm2, %zmm1
+; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm27, %zmm0, %zmm1
+; AVX512BW-NEXT:    vpermt2q %zmm8, %zmm0, %zmm7
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm22, %zmm2
+; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm7[0],zmm28[0],zmm7[2],zmm28[2],zmm7[4],zmm28[4],zmm7[6],zmm28[6]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm2, %zmm1
+; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm5, %zmm0, %zmm3
+; AVX512BW-NEXT:    vpermt2q %zmm6, %zmm0, %zmm4
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512BW-NEXT:    vpermi2q %zmm25, %zmm15, %zmm0
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm9[0],zmm10[0],zmm9[2],zmm10[2],zmm9[4],zmm10[4],zmm9[6],zmm10[6]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm19, %zmm1
+; AVX512BW-NEXT:    vmovdqa64 %zmm19, %zmm0
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm5 = [7,15,7,15,7,15,7,15]
+; AVX512BW-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-NEXT:    vpermi2q %zmm6, %zmm3, %zmm0
-; AVX512BW-NEXT:    vpermi2q %zmm4, %zmm5, %zmm1
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm9[0],zmm5[0],zmm9[2],zmm5[2],zmm9[4],zmm5[4],zmm9[6],zmm5[6]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm0
+; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm5, %zmm19
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm19 {%k1} = zmm17[1],zmm2[1],zmm17[3],zmm2[3],zmm17[5],zmm2[5],zmm17[7],zmm2[7]
+; AVX512BW-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm17, %zmm6
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm12 = [0,8,0,8,0,8,0,8]
+; AVX512BW-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm12, %zmm17
+; AVX512BW-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm4
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [1,9,1,9,1,9,1,9]
+; AVX512BW-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm2, %zmm6
+; AVX512BW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm12, %zmm1
+; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm2, %zmm0
 ; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm6
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm20
 ; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm0
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm4 = [7,15,7,15,7,15,7,15]
-; AVX512BW-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm23, %zmm2
-; AVX512BW-NEXT:    vpermt2q %zmm23, %zmm4, %zmm1
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm24[1],zmm3[1],zmm24[3],zmm3[3],zmm24[5],zmm3[5],zmm24[7],zmm3[7]
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm5, %zmm1
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm13[1],zmm11[1],zmm13[3],zmm11[3],zmm13[5],zmm11[5],zmm13[7],zmm11[7]
 ; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm24, %zmm1
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm14 = [0,8,0,8,0,8,0,8]
-; AVX512BW-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm14, %zmm24
-; AVX512BW-NEXT:    vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm23 = [1,9,1,9,1,9,1,9]
-; AVX512BW-NEXT:    # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm23, %zmm1
+; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm1
+; AVX512BW-NEXT:    vpermt2q %zmm11, %zmm12, %zmm13
+; AVX512BW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm11, %zmm2, %zmm1
 ; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm14, %zmm6
-; AVX512BW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm23, %zmm0
+; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm12, %zmm20
+; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm2, %zmm0
 ; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm24
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm1
-; AVX512BW-NEXT:    vpermt2q %zmm22, %zmm4, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm15[1],zmm28[1],zmm15[3],zmm28[3],zmm15[5],zmm28[5],zmm15[7],zmm28[7]
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm15, %zmm6
-; AVX512BW-NEXT:    vpermt2q %zmm28, %zmm14, %zmm15
-; AVX512BW-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm28, %zmm23, %zmm6
-; AVX512BW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm22, %zmm14, %zmm24
-; AVX512BW-NEXT:    vpermt2q %zmm22, %zmm23, %zmm1
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm17
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm0
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm9, %zmm5, %zmm1
+; AVX512BW-NEXT:    vmovdqa64 %zmm26, %zmm10
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm30[1],zmm26[1],zmm30[3],zmm26[3],zmm30[5],zmm26[5],zmm30[7],zmm26[7]
 ; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm19, %zmm29
-; AVX512BW-NEXT:    vmovdqa64 %zmm19, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm17, %zmm4, %zmm19
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm19 {%k1} = zmm30[1],zmm27[1],zmm30[3],zmm27[3],zmm30[5],zmm27[5],zmm30[7],zmm27[7]
 ; AVX512BW-NEXT:    vmovdqa64 %zmm30, %zmm1
-; AVX512BW-NEXT:    vpermt2q %zmm27, %zmm14, %zmm30
-; AVX512BW-NEXT:    vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm27, %zmm23, %zmm1
+; AVX512BW-NEXT:    vmovdqa64 %zmm30, %zmm26
+; AVX512BW-NEXT:    vpermt2q %zmm10, %zmm12, %zmm26
+; AVX512BW-NEXT:    vpermt2q %zmm10, %zmm2, %zmm1
 ; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm17, %zmm14, %zmm29
-; AVX512BW-NEXT:    vpermt2q %zmm17, %zmm23, %zmm0
+; AVX512BW-NEXT:    vpermt2q %zmm9, %zmm12, %zmm17
+; AVX512BW-NEXT:    vpermt2q %zmm9, %zmm2, %zmm0
 ; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm20, %zmm28
-; AVX512BW-NEXT:    vmovdqa64 %zmm20, %zmm3
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm4, %zmm20
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm20 {%k1} = zmm31[1],zmm16[1],zmm31[3],zmm16[3],zmm31[5],zmm16[5],zmm31[7],zmm16[7]
-; AVX512BW-NEXT:    vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm31, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 %zmm31, %zmm17
-; AVX512BW-NEXT:    vpermt2q %zmm16, %zmm14, %zmm17
-; AVX512BW-NEXT:    vpermt2q %zmm16, %zmm23, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm14, %zmm28
-; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm23, %zmm3
-; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm21
-; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm31
-; AVX512BW-NEXT:    vpermt2q %zmm13, %zmm4, %zmm11
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm11 {%k1} = zmm3[1],zmm0[1],zmm3[3],zmm0[3],zmm3[5],zmm0[5],zmm3[7],zmm0[7]
-; AVX512BW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm2
-; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm14, %zmm3
-; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm23, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm13, %zmm14, %zmm21
-; AVX512BW-NEXT:    vpermt2q %zmm13, %zmm23, %zmm31
-; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm20
-; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm25
-; AVX512BW-NEXT:    vpermt2q %zmm7, %zmm4, %zmm2
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm26[1],zmm8[1],zmm26[3],zmm8[3],zmm26[5],zmm8[5],zmm26[7],zmm8[7]
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, (%rsp) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm26, %zmm2
-; AVX512BW-NEXT:    vpermt2q %zmm8, %zmm14, %zmm26
-; AVX512BW-NEXT:    vpermt2q %zmm8, %zmm23, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm7, %zmm14, %zmm20
-; AVX512BW-NEXT:    vpermt2q %zmm7, %zmm23, %zmm25
+; AVX512BW-NEXT:    vmovdqa64 %zmm16, %zmm19
+; AVX512BW-NEXT:    vmovdqa64 %zmm16, %zmm31
+; AVX512BW-NEXT:    vmovdqa64 %zmm29, %zmm3
+; AVX512BW-NEXT:    vpermt2q %zmm29, %zmm5, %zmm16
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm14, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm30, %zmm16
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm27, %zmm14, %zmm16
-; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm5, %zmm14, %zmm0
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm16 {%k1} = zmm24[1],zmm6[1],zmm24[3],zmm6[3],zmm24[5],zmm6[5],zmm24[7],zmm6[7]
+; AVX512BW-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm24, %zmm0
+; AVX512BW-NEXT:    vmovdqa64 %zmm24, %zmm29
+; AVX512BW-NEXT:    vpermt2q %zmm6, %zmm12, %zmm29
+; AVX512BW-NEXT:    vpermt2q %zmm6, %zmm2, %zmm0
 ; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vpermi2q %zmm18, %zmm0, %zmm14
-; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm2
-; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm23, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm30, %zmm22
-; AVX512BW-NEXT:    vpermt2q %zmm27, %zmm23, %zmm22
-; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm2
-; AVX512BW-NEXT:    vpermt2q %zmm5, %zmm23, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermi2q %zmm18, %zmm0, %zmm23
-; AVX512BW-NEXT:    vpermt2q %zmm18, %zmm4, %zmm0
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm9[1],zmm5[1],zmm9[3],zmm5[3],zmm9[5],zmm5[5],zmm9[7],zmm5[7]
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm18
-; AVX512BW-NEXT:    vpermt2q %zmm27, %zmm4, %zmm30
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm30 {%k1} = zmm6[1],zmm3[1],zmm6[3],zmm3[3],zmm6[5],zmm3[5],zmm6[7],zmm3[7]
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm7 # 64-byte Folded Reload
+; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm12, %zmm19
+; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm2, %zmm31
+; AVX512BW-NEXT:    vmovdqu64 (%rsp), %zmm1 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm16
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm27
+; AVX512BW-NEXT:    vpermt2q %zmm21, %zmm5, %zmm1
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm23[1],zmm18[1],zmm23[3],zmm18[3],zmm23[5],zmm18[5],zmm23[7],zmm18[7]
+; AVX512BW-NEXT:    vmovdqu64 %zmm1, (%rsp) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm23, %zmm1
+; AVX512BW-NEXT:    vmovdqa64 %zmm23, %zmm24
+; AVX512BW-NEXT:    vpermt2q %zmm18, %zmm12, %zmm24
+; AVX512BW-NEXT:    vpermt2q %zmm18, %zmm2, %zmm1
+; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm21, %zmm12, %zmm16
+; AVX512BW-NEXT:    vpermt2q %zmm21, %zmm2, %zmm27
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm18
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm30
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm8 # 64-byte Folded Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm10 # 64-byte Folded Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm11 # 64-byte Folded Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm12 # 64-byte Folded Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm13 # 64-byte Folded Reload
+; AVX512BW-NEXT:    vpermt2q %zmm8, %zmm5, %zmm1
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm0 # 64-byte Folded Reload
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm10[1],zmm0[1],zmm10[3],zmm0[3],zmm10[5],zmm0[5],zmm10[7],zmm0[7]
+; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm1
+; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm12, %zmm10
+; AVX512BW-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm2, %zmm1
+; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm8, %zmm12, %zmm18
+; AVX512BW-NEXT:    vpermt2q %zmm8, %zmm2, %zmm30
+; AVX512BW-NEXT:    vmovdqa64 %zmm22, %zmm6
+; AVX512BW-NEXT:    vmovdqa64 %zmm22, %zmm23
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm8, %zmm5, %zmm6
+; AVX512BW-NEXT:    vmovdqa64 %zmm28, %zmm0
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm6 {%k1} = zmm21[1],zmm28[1],zmm21[3],zmm28[3],zmm21[5],zmm28[5],zmm21[7],zmm28[7]
+; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm28
+; AVX512BW-NEXT:    vmovdqa64 %zmm21, %zmm1
+; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm12, %zmm21
+; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm2, %zmm1
+; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm8, %zmm12, %zmm23
+; AVX512BW-NEXT:    vpermt2q %zmm8, %zmm2, %zmm22
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm14, %zmm0
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm8, %zmm12, %zmm0
 ; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm15 # 64-byte Folded Reload
-; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [7,15,7,15]
-; AVX512BW-NEXT:    # ymm0 = mem[0,1,0,1]
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm9 # 64-byte Folded Reload
+; AVX512BW-NEXT:    vpermi2q %zmm25, %zmm15, %zmm12
+; AVX512BW-NEXT:    vmovdqa64 %zmm14, %zmm3
+; AVX512BW-NEXT:    vpermt2q %zmm8, %zmm2, %zmm3
+; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermi2q %zmm25, %zmm15, %zmm2
+; AVX512BW-NEXT:    vpermt2q %zmm25, %zmm5, %zmm15
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm14[1],zmm8[1],zmm14[3],zmm8[3],zmm14[5],zmm8[5],zmm14[7],zmm8[7]
+; AVX512BW-NEXT:    vmovdqa64 %zmm15, %zmm25
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm7 # 64-byte Folded Reload
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm0 # 64-byte Folded Reload
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm8 # 64-byte Folded Reload
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
+; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm1 # 64-byte Folded Reload
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm13 # 64-byte Folded Reload
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
+; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm3 # 64-byte Folded Reload
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm9 # 64-byte Folded Reload
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm6 # 64-byte Folded Reload
-; AVX512BW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm4 # 64-byte Folded Reload
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm11 # 64-byte Folded Reload
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm6 # 64-byte Folded Reload
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm9[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm7, %zmm0
+; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm6 # 64-byte Folded Reload
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm15 # 64-byte Folded Reload
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm10 # 64-byte Folded Reload
+; AVX512BW-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm10 # 64-byte Folded Reload
+; AVX512BW-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm10 # 64-byte Folded Reload
+; AVX512BW-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm10 # 64-byte Folded Reload
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm14 # 64-byte Folded Reload
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm5, %zmm0
 ; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm8[4,5,6,7]
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
 ; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
 ; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm2[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm19, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm3[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm11
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm4[4,5,6,7]
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm13[4,5,6,7]
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
 ; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
 ; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm5[4,5,6,7]
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm9[4,5,6,7]
 ; AVX512BW-NEXT:    vmovdqu64 (%rsp), %zmm1 # 64-byte Reload
 ; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
 ; AVX512BW-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm11[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload
+; AVX512BW-NEXT:    # ymm0 = mem[0,1,2,3],ymm15[4,5,6,7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm28, %zmm0
+; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
 ; AVX512BW-NEXT:    # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm18, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm6[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm30, %zmm0
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm25, %zmm25
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm10[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
 ; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm24 {%k1}
-; AVX512BW-NEXT:    vmovdqa 64(%rdi), %xmm0
-; AVX512BW-NEXT:    vinserti128 $1, 192(%rdi), %ymm0, %ymm0
-; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm15
-; AVX512BW-NEXT:    vinserti128 $1, 128(%rdi), %ymm15, %ymm15
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm30 = ymm15[0],ymm0[0],ymm15[2],ymm0[2]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm30, %zmm24, %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm26, %zmm17 {%k1}
+; AVX512BW-NEXT:    vmovdqa 64(%rdi), %xmm8
+; AVX512BW-NEXT:    vinserti128 $1, 192(%rdi), %ymm8, %ymm8
+; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512BW-NEXT:    vinserti128 $1, 128(%rdi), %ymm0, %ymm0
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm28 = ymm0[0],ymm8[0],ymm0[2],ymm8[2]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm28, %zmm17, %zmm26
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm1 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 576(%rdi), %xmm24
-; AVX512BW-NEXT:    vinserti32x4 $1, 704(%rdi), %ymm24, %ymm30
-; AVX512BW-NEXT:    vmovdqa64 512(%rdi), %xmm24
-; AVX512BW-NEXT:    vinserti32x4 $1, 640(%rdi), %ymm24, %ymm24
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm27 = ymm24[0],ymm30[0],ymm24[2],ymm30[2]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm27, %zmm1, %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm17, %zmm28 {%k1}
-; AVX512BW-NEXT:    vmovdqa 1088(%rdi), %xmm9
-; AVX512BW-NEXT:    vinserti32x4 $1, 1216(%rdi), %ymm9, %ymm27
-; AVX512BW-NEXT:    vmovdqa 1024(%rdi), %xmm9
-; AVX512BW-NEXT:    vinserti128 $1, 1152(%rdi), %ymm9, %ymm9
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm4 = ymm9[0],ymm27[0],ymm9[2],ymm27[2]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm4, %zmm28, %zmm1
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm20 {%k1}
+; AVX512BW-NEXT:    vmovdqa 576(%rdi), %xmm13
+; AVX512BW-NEXT:    vinserti128 $1, 704(%rdi), %ymm13, %ymm13
+; AVX512BW-NEXT:    vmovdqa64 512(%rdi), %xmm28
+; AVX512BW-NEXT:    vinserti32x4 $1, 640(%rdi), %ymm28, %ymm28
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm4 = ymm28[0],ymm13[0],ymm28[2],ymm13[2]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm4, %zmm20, %zmm1
 ; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm24, %zmm16 {%k1}
+; AVX512BW-NEXT:    vmovdqa 1088(%rdi), %xmm4
+; AVX512BW-NEXT:    vinserti128 $1, 1216(%rdi), %ymm4, %ymm4
+; AVX512BW-NEXT:    vmovdqa 1024(%rdi), %xmm7
+; AVX512BW-NEXT:    vinserti128 $1, 1152(%rdi), %ymm7, %ymm7
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm5 = ymm7[0],ymm4[0],ymm7[2],ymm4[2]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm5, %zmm16, %zmm24
+; AVX512BW-NEXT:    vmovdqa64 %zmm29, %zmm19 {%k1}
+; AVX512BW-NEXT:    vmovdqa 1600(%rdi), %xmm5
+; AVX512BW-NEXT:    vinserti128 $1, 1728(%rdi), %ymm5, %ymm5
+; AVX512BW-NEXT:    vmovdqa 1536(%rdi), %xmm11
+; AVX512BW-NEXT:    vinserti128 $1, 1664(%rdi), %ymm11, %ymm11
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm29 = ymm11[0],ymm5[0],ymm11[2],ymm5[2]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm29, %zmm19, %zmm19
+; AVX512BW-NEXT:    vmovdqa64 %zmm21, %zmm23 {%k1}
+; AVX512BW-NEXT:    vmovdqa 2112(%rdi), %xmm6
+; AVX512BW-NEXT:    vinserti128 $1, 2240(%rdi), %ymm6, %ymm6
+; AVX512BW-NEXT:    vmovdqa64 2048(%rdi), %xmm29
+; AVX512BW-NEXT:    vinserti32x4 $1, 2176(%rdi), %ymm29, %ymm29
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm29[0],ymm6[0],ymm29[2],ymm6[2]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm3, %zmm23, %zmm3
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm29 {%k1}
-; AVX512BW-NEXT:    vmovdqa 1600(%rdi), %xmm4
-; AVX512BW-NEXT:    vinserti128 $1, 1728(%rdi), %ymm4, %ymm4
-; AVX512BW-NEXT:    vmovdqa64 1536(%rdi), %xmm17
-; AVX512BW-NEXT:    vinserti32x4 $1, 1664(%rdi), %ymm17, %ymm17
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm19 = ymm17[0],ymm4[0],ymm17[2],ymm4[2]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm19, %zmm29, %zmm13
-; AVX512BW-NEXT:    vmovdqa64 %zmm26, %zmm20 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 2112(%rdi), %xmm19
-; AVX512BW-NEXT:    vinserti32x4 $1, 2240(%rdi), %ymm19, %ymm19
-; AVX512BW-NEXT:    vmovdqa64 2048(%rdi), %xmm26
-; AVX512BW-NEXT:    vinserti32x4 $1, 2176(%rdi), %ymm26, %ymm26
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm29 = ymm26[0],ymm19[0],ymm26[2],ymm19[2]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm29, %zmm20, %zmm5
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm21 {%k1}
-; AVX512BW-NEXT:    vmovdqa 2624(%rdi), %xmm12
-; AVX512BW-NEXT:    vinserti128 $1, 2752(%rdi), %ymm12, %ymm12
-; AVX512BW-NEXT:    vmovdqa64 2560(%rdi), %xmm29
-; AVX512BW-NEXT:    vinserti32x4 $1, 2688(%rdi), %ymm29, %ymm29
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm18 = ymm29[0],ymm12[0],ymm29[2],ymm12[2]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm18, %zmm21, %zmm18
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm18 {%k1}
+; AVX512BW-NEXT:    vmovdqa 2624(%rdi), %xmm10
+; AVX512BW-NEXT:    vinserti128 $1, 2752(%rdi), %ymm10, %ymm10
+; AVX512BW-NEXT:    vmovdqa64 2560(%rdi), %xmm23
+; AVX512BW-NEXT:    vinserti32x4 $1, 2688(%rdi), %ymm23, %ymm23
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm20 = ymm23[0],ymm10[0],ymm23[2],ymm10[2]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm20, %zmm18, %zmm18
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm16 {%k1}
-; AVX512BW-NEXT:    vmovdqa 3136(%rdi), %xmm6
-; AVX512BW-NEXT:    vinserti128 $1, 3264(%rdi), %ymm6, %ymm6
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm1 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 3136(%rdi), %xmm17
+; AVX512BW-NEXT:    vinserti32x4 $1, 3264(%rdi), %ymm17, %ymm17
 ; AVX512BW-NEXT:    vmovdqa64 3072(%rdi), %xmm20
 ; AVX512BW-NEXT:    vinserti32x4 $1, 3200(%rdi), %ymm20, %ymm20
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm21 = ymm20[0],ymm6[0],ymm20[2],ymm6[2]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm21, %zmm16, %zmm2
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm15 = ymm20[0],ymm17[0],ymm20[2],ymm17[2]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm15, %zmm1, %zmm15
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm14 {%k1}
-; AVX512BW-NEXT:    vmovdqa 3648(%rdi), %xmm7
-; AVX512BW-NEXT:    vinserti128 $1, 3776(%rdi), %ymm7, %ymm7
-; AVX512BW-NEXT:    vmovdqa64 3584(%rdi), %xmm21
-; AVX512BW-NEXT:    vinserti32x4 $1, 3712(%rdi), %ymm21, %ymm21
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm28 = ymm21[0],ymm7[0],ymm21[2],ymm7[2]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm28, %zmm14, %zmm1
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm22 {%k1}
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm6 = ymm20[1],ymm6[1],ymm20[3],ymm6[3]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm6, %zmm22, %zmm3
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm12 {%k1}
+; AVX512BW-NEXT:    vmovdqa 3648(%rdi), %xmm9
+; AVX512BW-NEXT:    vinserti128 $1, 3776(%rdi), %ymm9, %ymm9
+; AVX512BW-NEXT:    vmovdqa 3584(%rdi), %xmm14
+; AVX512BW-NEXT:    vinserti128 $1, 3712(%rdi), %ymm14, %ymm14
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm16 = ymm14[0],ymm9[0],ymm14[2],ymm9[2]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm16, %zmm12, %zmm1
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm16, %zmm12 {%k1}
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm16 = ymm20[1],ymm17[1],ymm20[3],ymm17[3]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm16, %zmm12, %zmm16
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm17, %zmm12 {%k1}
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm13 = ymm28[1],ymm13[1],ymm28[3],ymm13[3]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm13, %zmm12, %zmm13
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm17, %zmm12 {%k1}
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm8[1],ymm0[3],ymm8[3]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm12, %zmm0
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm8 {%k1}
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm6 = ymm24[1],ymm30[1],ymm24[3],ymm30[3]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm6, %zmm8, %zmm6
+; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm31 {%k1}
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm5 = ymm11[1],ymm5[1],ymm11[3],ymm5[3]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm5, %zmm31, %zmm5
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm8 {%k1}
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm15[1],ymm0[1],ymm15[3],ymm0[3]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm8, %zmm0
+; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm27 {%k1}
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm4 = ymm7[1],ymm4[1],ymm7[3],ymm4[3]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm4, %zmm27, %zmm4
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm30 {%k1}
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm7 = ymm23[1],ymm10[1],ymm23[3],ymm10[3]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm7, %zmm30, %zmm7
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm8 {%k1}
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm4 = ymm17[1],ymm4[1],ymm17[3],ymm4[3]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm4, %zmm8, %zmm4
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm22 {%k1}
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm6 = ymm29[1],ymm6[1],ymm29[3],ymm6[3]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm6, %zmm22, %zmm6
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm10 {%k1}
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm8 = ymm9[1],ymm27[1],ymm9[3],ymm27[3]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm8, %zmm10, %zmm8
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm31 {%k1}
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm9 = ymm29[1],ymm12[1],ymm29[3],ymm12[3]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm9, %zmm31, %zmm9
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm25 {%k1}
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm10 = ymm26[1],ymm19[1],ymm26[3],ymm19[3]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm10, %zmm25, %zmm10
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm12, %zmm23 {%k1}
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm7 = ymm21[1],ymm7[1],ymm21[3],ymm7[3]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm7, %zmm23, %zmm7
+; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm2 {%k1}
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm8 = ymm14[1],ymm9[1],ymm14[3],ymm9[3]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm8, %zmm2, %zmm2
 ; AVX512BW-NEXT:    vmovdqa64 %zmm1, 448(%rsi)
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, 384(%rsi)
+; AVX512BW-NEXT:    vmovdqa64 %zmm15, 384(%rsi)
 ; AVX512BW-NEXT:    vmovdqa64 %zmm18, 320(%rsi)
-; AVX512BW-NEXT:    vmovdqa64 %zmm5, 256(%rsi)
-; AVX512BW-NEXT:    vmovdqa64 %zmm13, 192(%rsi)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm1, 128(%rsi)
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, 256(%rsi)
+; AVX512BW-NEXT:    vmovdqa64 %zmm19, 192(%rsi)
+; AVX512BW-NEXT:    vmovdqa64 %zmm24, 128(%rsi)
 ; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
 ; AVX512BW-NEXT:    vmovaps %zmm1, 64(%rsi)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm1, (%rsi)
-; AVX512BW-NEXT:    vmovdqa64 %zmm7, 448(%rdx)
-; AVX512BW-NEXT:    vmovdqa64 %zmm10, 256(%rdx)
-; AVX512BW-NEXT:    vmovdqa64 %zmm9, 320(%rdx)
-; AVX512BW-NEXT:    vmovdqa64 %zmm8, 128(%rdx)
-; AVX512BW-NEXT:    vmovdqa64 %zmm4, 192(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm26, (%rsi)
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, 448(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm6, 256(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm7, 320(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm4, 128(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm5, 192(%rdx)
 ; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rdx)
-; AVX512BW-NEXT:    vmovdqa64 %zmm6, 64(%rdx)
-; AVX512BW-NEXT:    vmovdqa64 %zmm3, 384(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm13, 64(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm16, 384(%rdx)
 ; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-NEXT:    vmovaps %zmm0, 448(%rcx)
 ; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
@@ -11258,20 +11256,20 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-NEXT:    vmovaps %zmm0, 384(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm25, 448(%rax)
 ; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 448(%rax)
-; AVX512BW-NEXT:    vmovups (%rsp), %zmm0 # 64-byte Reload
 ; AVX512BW-NEXT:    vmovaps %zmm0, 256(%rax)
 ; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-NEXT:    vmovaps %zmm0, 320(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm11, 128(%rax)
+; AVX512BW-NEXT:    vmovups (%rsp), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm0, 128(%rax)
 ; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-NEXT:    vmovaps %zmm0, 192(%rax)
 ; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-NEXT:    vmovaps %zmm0, (%rax)
 ; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-NEXT:    vmovaps %zmm0, 64(%rax)
-; AVX512BW-NEXT:    addq $6600, %rsp # imm = 0x19C8
+; AVX512BW-NEXT:    addq $6664, %rsp # imm = 0x1A08
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
   %wide.vec = load <512 x i64>, ptr %in.vec, align 64

diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-6.ll
index 6ef479b8754197..ab51f0bf9135cc 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-6.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-6.ll
@@ -599,174 +599,170 @@ define void @store_i64_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512F-LABEL: store_i64_stride6_vf8:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512F-NEXT:    vmovdqa64 (%rdi), %zmm4
-; AVX512F-NEXT:    vmovdqa64 (%rsi), %zmm6
-; AVX512F-NEXT:    vmovdqa64 (%rdx), %zmm2
-; AVX512F-NEXT:    vmovdqa64 (%rcx), %zmm3
+; AVX512F-NEXT:    vmovdqa64 (%rdi), %zmm2
+; AVX512F-NEXT:    vmovdqa64 (%rsi), %zmm3
+; AVX512F-NEXT:    vmovdqa64 (%rdx), %zmm5
+; AVX512F-NEXT:    vmovdqa64 (%rcx), %zmm6
 ; AVX512F-NEXT:    vmovdqa64 (%r8), %zmm1
 ; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [4,12,5,13,4,12,5,13]
 ; AVX512F-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vpermi2q %zmm6, %zmm4, %zmm0
-; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [4,12,4,12]
-; AVX512F-NEXT:    # ymm5 = mem[0,1,0,1]
-; AVX512F-NEXT:    vpermi2q %zmm3, %zmm2, %zmm5
+; AVX512F-NEXT:    vpermi2q %zmm3, %zmm2, %zmm0
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [4,12,4,12]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
+; AVX512F-NEXT:    vpermi2q %zmm6, %zmm5, %zmm4
 ; AVX512F-NEXT:    movb $12, %r10b
 ; AVX512F-NEXT:    kmovw %r10d, %k1
-; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm0 {%k1}
+; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm0 {%k1}
 ; AVX512F-NEXT:    movb $16, %r10b
 ; AVX512F-NEXT:    kmovw %r10d, %k2
 ; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k2}
-; AVX512F-NEXT:    vmovdqa64 (%r9), %zmm5
+; AVX512F-NEXT:    vmovdqa64 (%r9), %zmm4
 ; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm7 = [2,10,2,10,2,10,2,10]
 ; AVX512F-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vpermi2q %zmm6, %zmm4, %zmm7
+; AVX512F-NEXT:    vpermi2q %zmm3, %zmm2, %zmm7
 ; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [1,9,2,10,1,9,2,10]
 ; AVX512F-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vpermi2q %zmm3, %zmm2, %zmm8
+; AVX512F-NEXT:    vpermi2q %zmm6, %zmm5, %zmm8
 ; AVX512F-NEXT:    movb $48, %r9b
 ; AVX512F-NEXT:    kmovw %r9d, %k2
 ; AVX512F-NEXT:    vmovdqa64 %zmm7, %zmm8 {%k2}
-; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm7 = <0,1,9,u,4,5,6,7>
-; AVX512F-NEXT:    vpermi2q %zmm1, %zmm8, %zmm7
-; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm8 = [0,1,2,9,4,5,6,7]
-; AVX512F-NEXT:    vpermi2q %zmm5, %zmm7, %zmm8
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm7 = [6,14,6,14,6,14,6,14]
-; AVX512F-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vpermi2q %zmm6, %zmm4, %zmm7
+; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm9 = <0,1,9,u,4,5,6,7>
+; AVX512F-NEXT:    vpermi2q %zmm1, %zmm8, %zmm9
+; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm7 = [0,1,2,9,4,5,6,7]
+; AVX512F-NEXT:    vpermi2q %zmm4, %zmm9, %zmm7
+; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm8 = [6,14,6,14,6,14,6,14]
+; AVX512F-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vpermi2q %zmm3, %zmm2, %zmm8
 ; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm9 = [5,13,6,14,5,13,6,14]
 ; AVX512F-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vpermi2q %zmm3, %zmm2, %zmm9
-; AVX512F-NEXT:    vmovdqa64 %zmm7, %zmm9 {%k2}
-; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm7 = <0,1,13,u,4,5,6,7>
-; AVX512F-NEXT:    vpermi2q %zmm1, %zmm9, %zmm7
+; AVX512F-NEXT:    vpermi2q %zmm6, %zmm5, %zmm9
+; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm9 {%k2}
+; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm8 = <0,1,13,u,4,5,6,7>
+; AVX512F-NEXT:    vpermi2q %zmm1, %zmm9, %zmm8
 ; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm9 = [0,1,2,13,4,5,6,7]
-; AVX512F-NEXT:    vpermi2q %zmm5, %zmm7, %zmm9
-; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [0,8,1,9,0,8,1,9]
-; AVX512F-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vpermi2q %zmm6, %zmm4, %zmm7
-; AVX512F-NEXT:    vmovdqa (%rdx), %xmm10
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0]
-; AVX512F-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm10, %zmm0, %zmm7 {%k1}
-; AVX512F-NEXT:    vinserti32x4 $2, (%r8), %zmm7, %zmm7
-; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm10 = [0,1,2,3,4,8,6,7]
-; AVX512F-NEXT:    vpermi2q %zmm5, %zmm7, %zmm10
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm7 = [7,15,7,15,7,15,7,15]
-; AVX512F-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vpermi2q %zmm3, %zmm2, %zmm7
-; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm11 = [7,15,7,15]
-; AVX512F-NEXT:    # ymm11 = mem[0,1,0,1]
-; AVX512F-NEXT:    vpermi2q %zmm6, %zmm4, %zmm11
-; AVX512F-NEXT:    vshufi64x2 {{.*#+}} zmm4 = zmm11[0,1,2,3],zmm7[4,5,6,7]
+; AVX512F-NEXT:    vpermi2q %zmm4, %zmm8, %zmm9
+; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm8 = [7,15,7,15,7,15,7,15]
+; AVX512F-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm10 = [3,11,3,11,3,11,3,11]
+; AVX512F-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vpermi2q %zmm6, %zmm5, %zmm10
+; AVX512F-NEXT:    vpermt2q %zmm6, %zmm8, %zmm5
+; AVX512F-NEXT:    vpermi2q %zmm3, %zmm2, %zmm8
+; AVX512F-NEXT:    vshufi64x2 {{.*#+}} zmm5 = zmm8[0,1,2,3],zmm5[4,5,6,7]
 ; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm6 = <14,u,2,3,4,5,15,u>
-; AVX512F-NEXT:    vpermi2q %zmm1, %zmm4, %zmm6
-; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [0,14,2,3,4,5,6,15]
-; AVX512F-NEXT:    vpermi2q %zmm5, %zmm6, %zmm4
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm6 = [3,11,3,11,3,11,3,11]
-; AVX512F-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vpermi2q %zmm1, %zmm5, %zmm6
+; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm5 = [0,14,2,3,4,5,6,15]
+; AVX512F-NEXT:    vpermi2q %zmm4, %zmm6, %zmm5
+; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm6 = [0,8,1,9,0,8,1,9]
+; AVX512F-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3]
 ; AVX512F-NEXT:    vpermi2q %zmm3, %zmm2, %zmm6
+; AVX512F-NEXT:    vmovdqa (%rdx), %xmm2
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0]
+; AVX512F-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm0, %zmm6 {%k1}
+; AVX512F-NEXT:    vinserti32x4 $2, (%r8), %zmm6, %zmm2
+; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,2,3,4,8,6,7]
+; AVX512F-NEXT:    vpermi2q %zmm4, %zmm2, %zmm3
 ; AVX512F-NEXT:    vmovdqa (%rdi), %ymm2
 ; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm6, %zmm2
-; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm3 = <10,u,2,3,4,5,11,u>
-; AVX512F-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm10, %zmm2
+; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm6 = <10,u,2,3,4,5,11,u>
+; AVX512F-NEXT:    vpermi2q %zmm1, %zmm2, %zmm6
 ; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,10,2,3,4,5,6,11]
-; AVX512F-NEXT:    vpermi2q %zmm5, %zmm3, %zmm1
+; AVX512F-NEXT:    vpermi2q %zmm4, %zmm6, %zmm1
 ; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,12,6,7]
-; AVX512F-NEXT:    vpermi2q %zmm5, %zmm0, %zmm2
+; AVX512F-NEXT:    vpermi2q %zmm4, %zmm0, %zmm2
 ; AVX512F-NEXT:    vmovdqa64 %zmm2, 192(%rax)
 ; AVX512F-NEXT:    vmovdqa64 %zmm1, 128(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm4, 320(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm5, 320(%rax)
 ; AVX512F-NEXT:    vmovdqa64 %zmm9, 256(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm8, 64(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm10, (%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm7, 64(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm3, (%rax)
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512BW-LABEL: store_i64_stride6_vf8:
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm4
-; AVX512BW-NEXT:    vmovdqa64 (%rsi), %zmm6
-; AVX512BW-NEXT:    vmovdqa64 (%rdx), %zmm2
-; AVX512BW-NEXT:    vmovdqa64 (%rcx), %zmm3
+; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm2
+; AVX512BW-NEXT:    vmovdqa64 (%rsi), %zmm3
+; AVX512BW-NEXT:    vmovdqa64 (%rdx), %zmm5
+; AVX512BW-NEXT:    vmovdqa64 (%rcx), %zmm6
 ; AVX512BW-NEXT:    vmovdqa64 (%r8), %zmm1
 ; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [4,12,5,13,4,12,5,13]
 ; AVX512BW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vpermi2q %zmm6, %zmm4, %zmm0
-; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [4,12,4,12]
-; AVX512BW-NEXT:    # ymm5 = mem[0,1,0,1]
-; AVX512BW-NEXT:    vpermi2q %zmm3, %zmm2, %zmm5
+; AVX512BW-NEXT:    vpermi2q %zmm3, %zmm2, %zmm0
+; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [4,12,4,12]
+; AVX512BW-NEXT:    # ymm4 = mem[0,1,0,1]
+; AVX512BW-NEXT:    vpermi2q %zmm6, %zmm5, %zmm4
 ; AVX512BW-NEXT:    movb $12, %r10b
 ; AVX512BW-NEXT:    kmovd %r10d, %k1
-; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm0 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm0 {%k1}
 ; AVX512BW-NEXT:    movb $16, %r10b
 ; AVX512BW-NEXT:    kmovd %r10d, %k2
 ; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k2}
-; AVX512BW-NEXT:    vmovdqa64 (%r9), %zmm5
+; AVX512BW-NEXT:    vmovdqa64 (%r9), %zmm4
 ; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm7 = [2,10,2,10,2,10,2,10]
 ; AVX512BW-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vpermi2q %zmm6, %zmm4, %zmm7
+; AVX512BW-NEXT:    vpermi2q %zmm3, %zmm2, %zmm7
 ; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [1,9,2,10,1,9,2,10]
 ; AVX512BW-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vpermi2q %zmm3, %zmm2, %zmm8
+; AVX512BW-NEXT:    vpermi2q %zmm6, %zmm5, %zmm8
 ; AVX512BW-NEXT:    movb $48, %r9b
 ; AVX512BW-NEXT:    kmovd %r9d, %k2
 ; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm8 {%k2}
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm7 = <0,1,9,u,4,5,6,7>
-; AVX512BW-NEXT:    vpermi2q %zmm1, %zmm8, %zmm7
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm8 = [0,1,2,9,4,5,6,7]
-; AVX512BW-NEXT:    vpermi2q %zmm5, %zmm7, %zmm8
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm7 = [6,14,6,14,6,14,6,14]
-; AVX512BW-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vpermi2q %zmm6, %zmm4, %zmm7
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm9 = <0,1,9,u,4,5,6,7>
+; AVX512BW-NEXT:    vpermi2q %zmm1, %zmm8, %zmm9
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm7 = [0,1,2,9,4,5,6,7]
+; AVX512BW-NEXT:    vpermi2q %zmm4, %zmm9, %zmm7
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm8 = [6,14,6,14,6,14,6,14]
+; AVX512BW-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vpermi2q %zmm3, %zmm2, %zmm8
 ; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm9 = [5,13,6,14,5,13,6,14]
 ; AVX512BW-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vpermi2q %zmm3, %zmm2, %zmm9
-; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm9 {%k2}
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm7 = <0,1,13,u,4,5,6,7>
-; AVX512BW-NEXT:    vpermi2q %zmm1, %zmm9, %zmm7
+; AVX512BW-NEXT:    vpermi2q %zmm6, %zmm5, %zmm9
+; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm9 {%k2}
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm8 = <0,1,13,u,4,5,6,7>
+; AVX512BW-NEXT:    vpermi2q %zmm1, %zmm9, %zmm8
 ; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm9 = [0,1,2,13,4,5,6,7]
-; AVX512BW-NEXT:    vpermi2q %zmm5, %zmm7, %zmm9
-; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [0,8,1,9,0,8,1,9]
-; AVX512BW-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vpermi2q %zmm6, %zmm4, %zmm7
-; AVX512BW-NEXT:    vmovdqa (%rdx), %xmm10
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0]
-; AVX512BW-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm10, %zmm0, %zmm7 {%k1}
-; AVX512BW-NEXT:    vinserti32x4 $2, (%r8), %zmm7, %zmm7
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm10 = [0,1,2,3,4,8,6,7]
-; AVX512BW-NEXT:    vpermi2q %zmm5, %zmm7, %zmm10
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm7 = [7,15,7,15,7,15,7,15]
-; AVX512BW-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vpermi2q %zmm3, %zmm2, %zmm7
-; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm11 = [7,15,7,15]
-; AVX512BW-NEXT:    # ymm11 = mem[0,1,0,1]
-; AVX512BW-NEXT:    vpermi2q %zmm6, %zmm4, %zmm11
-; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm4 = zmm11[0,1,2,3],zmm7[4,5,6,7]
+; AVX512BW-NEXT:    vpermi2q %zmm4, %zmm8, %zmm9
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm8 = [7,15,7,15,7,15,7,15]
+; AVX512BW-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm10 = [3,11,3,11,3,11,3,11]
+; AVX512BW-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vpermi2q %zmm6, %zmm5, %zmm10
+; AVX512BW-NEXT:    vpermt2q %zmm6, %zmm8, %zmm5
+; AVX512BW-NEXT:    vpermi2q %zmm3, %zmm2, %zmm8
+; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm5 = zmm8[0,1,2,3],zmm5[4,5,6,7]
 ; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm6 = <14,u,2,3,4,5,15,u>
-; AVX512BW-NEXT:    vpermi2q %zmm1, %zmm4, %zmm6
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [0,14,2,3,4,5,6,15]
-; AVX512BW-NEXT:    vpermi2q %zmm5, %zmm6, %zmm4
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm6 = [3,11,3,11,3,11,3,11]
-; AVX512BW-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vpermi2q %zmm1, %zmm5, %zmm6
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm5 = [0,14,2,3,4,5,6,15]
+; AVX512BW-NEXT:    vpermi2q %zmm4, %zmm6, %zmm5
+; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm6 = [0,8,1,9,0,8,1,9]
+; AVX512BW-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpermi2q %zmm3, %zmm2, %zmm6
+; AVX512BW-NEXT:    vmovdqa (%rdx), %xmm2
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0]
+; AVX512BW-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm0, %zmm6 {%k1}
+; AVX512BW-NEXT:    vinserti32x4 $2, (%r8), %zmm6, %zmm2
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,2,3,4,8,6,7]
+; AVX512BW-NEXT:    vpermi2q %zmm4, %zmm2, %zmm3
 ; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm2
 ; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm6, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = <10,u,2,3,4,5,11,u>
-; AVX512BW-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm10, %zmm2
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm6 = <10,u,2,3,4,5,11,u>
+; AVX512BW-NEXT:    vpermi2q %zmm1, %zmm2, %zmm6
 ; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,10,2,3,4,5,6,11]
-; AVX512BW-NEXT:    vpermi2q %zmm5, %zmm3, %zmm1
+; AVX512BW-NEXT:    vpermi2q %zmm4, %zmm6, %zmm1
 ; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,12,6,7]
-; AVX512BW-NEXT:    vpermi2q %zmm5, %zmm0, %zmm2
+; AVX512BW-NEXT:    vpermi2q %zmm4, %zmm0, %zmm2
 ; AVX512BW-NEXT:    vmovdqa64 %zmm2, 192(%rax)
 ; AVX512BW-NEXT:    vmovdqa64 %zmm1, 128(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm4, 320(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm5, 320(%rax)
 ; AVX512BW-NEXT:    vmovdqa64 %zmm9, 256(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm8, 64(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm10, (%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm7, 64(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, (%rax)
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
   %in.vec0 = load <8 x i64>, ptr %in.vecptr0, align 64
@@ -1445,20 +1441,20 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512F-LABEL: store_i64_stride6_vf16:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512F-NEXT:    vmovdqa64 64(%rdi), %zmm13
-; AVX512F-NEXT:    vmovdqa64 (%rdi), %zmm5
-; AVX512F-NEXT:    vmovdqa64 (%rsi), %zmm6
-; AVX512F-NEXT:    vmovdqa64 64(%rsi), %zmm14
-; AVX512F-NEXT:    vmovdqa64 64(%rdx), %zmm7
+; AVX512F-NEXT:    vmovdqa64 64(%rdi), %zmm8
+; AVX512F-NEXT:    vmovdqa64 (%rdi), %zmm12
+; AVX512F-NEXT:    vmovdqa64 (%rsi), %zmm15
+; AVX512F-NEXT:    vmovdqa64 64(%rsi), %zmm11
+; AVX512F-NEXT:    vmovdqa64 64(%rdx), %zmm5
 ; AVX512F-NEXT:    vmovdqa64 (%rdx), %zmm2
 ; AVX512F-NEXT:    vmovdqa64 (%rcx), %zmm4
-; AVX512F-NEXT:    vmovdqa64 64(%rcx), %zmm11
+; AVX512F-NEXT:    vmovdqa64 64(%rcx), %zmm7
 ; AVX512F-NEXT:    vmovdqa64 (%r8), %zmm3
-; AVX512F-NEXT:    vmovdqa64 64(%r8), %zmm8
+; AVX512F-NEXT:    vmovdqa64 64(%r8), %zmm6
 ; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [4,12,5,13,4,12,5,13]
 ; AVX512F-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm1
-; AVX512F-NEXT:    vpermt2q %zmm6, %zmm0, %zmm1
+; AVX512F-NEXT:    vmovdqa64 %zmm12, %zmm1
+; AVX512F-NEXT:    vpermt2q %zmm15, %zmm0, %zmm1
 ; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm9 = [4,12,4,12]
 ; AVX512F-NEXT:    # ymm9 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm10
@@ -1469,136 +1465,135 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512F-NEXT:    movb $16, %r10b
 ; AVX512F-NEXT:    kmovw %r10d, %k2
 ; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm1 {%k2}
-; AVX512F-NEXT:    vpermi2q %zmm14, %zmm13, %zmm0
-; AVX512F-NEXT:    vpermi2q %zmm11, %zmm7, %zmm9
+; AVX512F-NEXT:    vpermi2q %zmm11, %zmm8, %zmm0
+; AVX512F-NEXT:    vpermi2q %zmm7, %zmm5, %zmm9
 ; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm0 {%k1}
-; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm0 {%k2}
+; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm0 {%k2}
 ; AVX512F-NEXT:    vmovdqa64 (%r9), %zmm10
-; AVX512F-NEXT:    vmovdqa64 64(%r9), %zmm16
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm18 = [6,14,6,14,6,14,6,14]
-; AVX512F-NEXT:    # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm15
-; AVX512F-NEXT:    vpermt2q %zmm14, %zmm18, %zmm15
-; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm12 = [5,13,6,14,5,13,6,14]
-; AVX512F-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm7, %zmm9
-; AVX512F-NEXT:    vpermt2q %zmm11, %zmm12, %zmm9
+; AVX512F-NEXT:    vmovdqa64 64(%r9), %zmm14
+; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm19 = [7,15,7,15,7,15,7,15]
+; AVX512F-NEXT:    # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm9
+; AVX512F-NEXT:    vpermt2q %zmm7, %zmm19, %zmm9
+; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm13
+; AVX512F-NEXT:    vpermt2q %zmm11, %zmm19, %zmm13
+; AVX512F-NEXT:    vshufi64x2 {{.*#+}} zmm9 = zmm13[0,1,2,3],zmm9[4,5,6,7]
+; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm20 = <14,u,2,3,4,5,15,u>
+; AVX512F-NEXT:    vpermt2q %zmm6, %zmm20, %zmm9
+; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm21 = [0,14,2,3,4,5,6,15]
+; AVX512F-NEXT:    vpermt2q %zmm14, %zmm21, %zmm9
+; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm22 = [6,14,6,14,6,14,6,14]
+; AVX512F-NEXT:    # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm17
+; AVX512F-NEXT:    vpermt2q %zmm11, %zmm22, %zmm17
+; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm16 = [5,13,6,14,5,13,6,14]
+; AVX512F-NEXT:    # zmm16 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm13
+; AVX512F-NEXT:    vpermt2q %zmm7, %zmm16, %zmm13
 ; AVX512F-NEXT:    movb $48, %r9b
 ; AVX512F-NEXT:    kmovw %r9d, %k2
-; AVX512F-NEXT:    vmovdqa64 %zmm15, %zmm9 {%k2}
-; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm19 = <0,1,13,u,4,5,6,7>
-; AVX512F-NEXT:    vpermt2q %zmm8, %zmm19, %zmm9
-; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm20 = [0,1,2,13,4,5,6,7]
-; AVX512F-NEXT:    vpermt2q %zmm16, %zmm20, %zmm9
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm21 = [2,10,2,10,2,10,2,10]
-; AVX512F-NEXT:    # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm22
-; AVX512F-NEXT:    vpermt2q %zmm14, %zmm21, %zmm22
-; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm17 = [1,9,2,10,1,9,2,10]
-; AVX512F-NEXT:    # zmm17 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm7, %zmm15
-; AVX512F-NEXT:    vpermt2q %zmm11, %zmm17, %zmm15
-; AVX512F-NEXT:    vmovdqa64 %zmm22, %zmm15 {%k2}
-; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm22 = <0,1,9,u,4,5,6,7>
-; AVX512F-NEXT:    vpermt2q %zmm8, %zmm22, %zmm15
-; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm23 = [0,1,2,9,4,5,6,7]
-; AVX512F-NEXT:    vpermt2q %zmm16, %zmm23, %zmm15
-; AVX512F-NEXT:    vpermi2q %zmm6, %zmm5, %zmm18
-; AVX512F-NEXT:    vpermi2q %zmm4, %zmm2, %zmm12
-; AVX512F-NEXT:    vmovdqa64 %zmm18, %zmm12 {%k2}
-; AVX512F-NEXT:    vpermt2q %zmm3, %zmm19, %zmm12
-; AVX512F-NEXT:    vpermt2q %zmm10, %zmm20, %zmm12
-; AVX512F-NEXT:    vpermi2q %zmm6, %zmm5, %zmm21
-; AVX512F-NEXT:    vpermi2q %zmm4, %zmm2, %zmm17
-; AVX512F-NEXT:    vmovdqa64 %zmm21, %zmm17 {%k2}
-; AVX512F-NEXT:    vpermt2q %zmm3, %zmm22, %zmm17
-; AVX512F-NEXT:    vpermt2q %zmm10, %zmm23, %zmm17
-; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm19 = [0,8,1,9,0,8,1,9]
-; AVX512F-NEXT:    # zmm19 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm18
-; AVX512F-NEXT:    vpermt2q %zmm6, %zmm19, %zmm18
-; AVX512F-NEXT:    vmovdqa64 (%rdx), %xmm20
+; AVX512F-NEXT:    vmovdqa64 %zmm17, %zmm13 {%k2}
+; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm23 = <0,1,13,u,4,5,6,7>
+; AVX512F-NEXT:    vpermt2q %zmm6, %zmm23, %zmm13
+; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm24 = [0,1,2,13,4,5,6,7]
+; AVX512F-NEXT:    vpermt2q %zmm14, %zmm24, %zmm13
+; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm25 = [2,10,2,10,2,10,2,10]
+; AVX512F-NEXT:    # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm26
+; AVX512F-NEXT:    vpermt2q %zmm11, %zmm25, %zmm26
+; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm18 = [1,9,2,10,1,9,2,10]
+; AVX512F-NEXT:    # zmm18 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm17
+; AVX512F-NEXT:    vpermt2q %zmm7, %zmm18, %zmm17
+; AVX512F-NEXT:    vmovdqa64 %zmm26, %zmm17 {%k2}
+; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm26 = <0,1,9,u,4,5,6,7>
+; AVX512F-NEXT:    vpermt2q %zmm6, %zmm26, %zmm17
+; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm27 = [0,1,2,9,4,5,6,7]
+; AVX512F-NEXT:    vpermt2q %zmm14, %zmm27, %zmm17
+; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm28
+; AVX512F-NEXT:    vpermt2q %zmm4, %zmm19, %zmm28
+; AVX512F-NEXT:    vpermi2q %zmm15, %zmm12, %zmm19
+; AVX512F-NEXT:    vshufi64x2 {{.*#+}} zmm19 = zmm19[0,1,2,3],zmm28[4,5,6,7]
+; AVX512F-NEXT:    vpermt2q %zmm3, %zmm20, %zmm19
+; AVX512F-NEXT:    vpermt2q %zmm10, %zmm21, %zmm19
+; AVX512F-NEXT:    vpermi2q %zmm15, %zmm12, %zmm22
+; AVX512F-NEXT:    vpermi2q %zmm4, %zmm2, %zmm16
+; AVX512F-NEXT:    vmovdqa64 %zmm22, %zmm16 {%k2}
+; AVX512F-NEXT:    vpermt2q %zmm3, %zmm23, %zmm16
+; AVX512F-NEXT:    vpermt2q %zmm10, %zmm24, %zmm16
+; AVX512F-NEXT:    vpermi2q %zmm15, %zmm12, %zmm25
+; AVX512F-NEXT:    vpermi2q %zmm4, %zmm2, %zmm18
+; AVX512F-NEXT:    vmovdqa64 %zmm25, %zmm18 {%k2}
+; AVX512F-NEXT:    vpermt2q %zmm3, %zmm26, %zmm18
+; AVX512F-NEXT:    vpermt2q %zmm10, %zmm27, %zmm18
+; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm20 = [0,8,1,9,0,8,1,9]
+; AVX512F-NEXT:    # zmm20 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vpermt2q %zmm15, %zmm20, %zmm12
+; AVX512F-NEXT:    vmovdqa (%rdx), %xmm15
 ; AVX512F-NEXT:    vmovdqa64 64(%rdx), %xmm21
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm20 = xmm20[0],mem[0]
-; AVX512F-NEXT:    vinserti32x4 $1, %xmm20, %ymm0, %ymm20
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm20, %zmm0, %zmm18 {%k1}
-; AVX512F-NEXT:    vinserti32x4 $2, (%r8), %zmm18, %zmm18
-; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm20 = [0,1,2,3,4,8,6,7]
-; AVX512F-NEXT:    vpermt2q %zmm10, %zmm20, %zmm18
-; AVX512F-NEXT:    vpermi2q %zmm14, %zmm13, %zmm19
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm21 = xmm21[0],mem[0]
-; AVX512F-NEXT:    vinserti32x4 $1, %xmm21, %ymm0, %ymm21
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm21, %zmm0, %zmm19 {%k1}
-; AVX512F-NEXT:    vinserti32x4 $2, 64(%r8), %zmm19, %zmm19
-; AVX512F-NEXT:    vpermt2q %zmm16, %zmm20, %zmm19
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm20 = [7,15,7,15,7,15,7,15]
-; AVX512F-NEXT:    # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm7, %zmm21
-; AVX512F-NEXT:    vpermt2q %zmm11, %zmm20, %zmm21
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} ymm22 = [7,15,7,15]
-; AVX512F-NEXT:    # ymm22 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vpermt2q %zmm14, %zmm22, %zmm13
-; AVX512F-NEXT:    vshufi64x2 {{.*#+}} zmm13 = zmm13[0,1,2,3],zmm21[4,5,6,7]
-; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm14 = <14,u,2,3,4,5,15,u>
-; AVX512F-NEXT:    vpermt2q %zmm8, %zmm14, %zmm13
-; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm21 = [0,14,2,3,4,5,6,15]
-; AVX512F-NEXT:    vpermt2q %zmm16, %zmm21, %zmm13
-; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm23 = [0,1,2,3,4,12,6,7]
-; AVX512F-NEXT:    vpermt2q %zmm16, %zmm23, %zmm0
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm24 = [3,11,3,11,3,11,3,11]
-; AVX512F-NEXT:    # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vpermt2q %zmm11, %zmm24, %zmm7
-; AVX512F-NEXT:    vmovdqa (%rdi), %ymm11
-; AVX512F-NEXT:    vmovdqa64 64(%rdi), %ymm25
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm25 = ymm25[1],mem[1],ymm25[3],mem[3]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm25, %zmm7, %zmm7
-; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm25 = <10,u,2,3,4,5,11,u>
-; AVX512F-NEXT:    vpermt2q %zmm8, %zmm25, %zmm7
-; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm8 = [0,10,2,3,4,5,6,11]
-; AVX512F-NEXT:    vpermt2q %zmm16, %zmm8, %zmm7
-; AVX512F-NEXT:    vpermi2q %zmm4, %zmm2, %zmm20
-; AVX512F-NEXT:    vpermt2q %zmm6, %zmm22, %zmm5
-; AVX512F-NEXT:    vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm20[4,5,6,7]
-; AVX512F-NEXT:    vpermt2q %zmm3, %zmm14, %zmm5
-; AVX512F-NEXT:    vpermt2q %zmm10, %zmm21, %zmm5
-; AVX512F-NEXT:    vpermt2q %zmm10, %zmm23, %zmm1
-; AVX512F-NEXT:    vpermt2q %zmm4, %zmm24, %zmm2
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm4 = ymm11[1],mem[1],ymm11[3],mem[3]
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm15 = xmm15[0],mem[0]
+; AVX512F-NEXT:    vinserti128 $1, %xmm15, %ymm0, %ymm15
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm15, %zmm0, %zmm12 {%k1}
+; AVX512F-NEXT:    vinserti32x4 $2, (%r8), %zmm12, %zmm12
+; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm15 = [0,1,2,3,4,8,6,7]
+; AVX512F-NEXT:    vpermt2q %zmm10, %zmm15, %zmm12
+; AVX512F-NEXT:    vpermt2q %zmm11, %zmm20, %zmm8
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm11 = xmm21[0],mem[0]
+; AVX512F-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm11, %zmm0, %zmm8 {%k1}
+; AVX512F-NEXT:    vinserti32x4 $2, 64(%r8), %zmm8, %zmm8
+; AVX512F-NEXT:    vpermt2q %zmm14, %zmm15, %zmm8
+; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,4,12,6,7]
+; AVX512F-NEXT:    vpermt2q %zmm14, %zmm11, %zmm0
+; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm15 = [3,11,3,11,3,11,3,11]
+; AVX512F-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vpermt2q %zmm7, %zmm15, %zmm5
+; AVX512F-NEXT:    vmovdqa (%rdi), %ymm7
+; AVX512F-NEXT:    vmovdqa64 64(%rdi), %ymm20
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm20 = ymm20[1],mem[1],ymm20[3],mem[3]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm20, %zmm5, %zmm5
+; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm20 = <10,u,2,3,4,5,11,u>
+; AVX512F-NEXT:    vpermt2q %zmm6, %zmm20, %zmm5
+; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm6 = [0,10,2,3,4,5,6,11]
+; AVX512F-NEXT:    vpermt2q %zmm14, %zmm6, %zmm5
+; AVX512F-NEXT:    vpermt2q %zmm10, %zmm11, %zmm1
+; AVX512F-NEXT:    vpermt2q %zmm4, %zmm15, %zmm2
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm4 = ymm7[1],mem[1],ymm7[3],mem[3]
 ; AVX512F-NEXT:    vinserti64x4 $0, %ymm4, %zmm2, %zmm2
-; AVX512F-NEXT:    vpermt2q %zmm3, %zmm25, %zmm2
-; AVX512F-NEXT:    vpermt2q %zmm10, %zmm8, %zmm2
-; AVX512F-NEXT:    vmovdqa64 %zmm17, 64(%rax)
+; AVX512F-NEXT:    vpermt2q %zmm3, %zmm20, %zmm2
+; AVX512F-NEXT:    vpermt2q %zmm10, %zmm6, %zmm2
+; AVX512F-NEXT:    vmovdqa64 %zmm18, 64(%rax)
 ; AVX512F-NEXT:    vmovdqa64 %zmm2, 128(%rax)
 ; AVX512F-NEXT:    vmovdqa64 %zmm1, 192(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm12, 256(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm5, 320(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm15, 448(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm7, 512(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm16, 256(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm19, 320(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm17, 448(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm5, 512(%rax)
 ; AVX512F-NEXT:    vmovdqa64 %zmm0, 576(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm9, 640(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm13, 704(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm19, 384(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm18, (%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm13, 640(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm9, 704(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm8, 384(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm12, (%rax)
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512BW-LABEL: store_i64_stride6_vf16:
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %zmm13
-; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm5
-; AVX512BW-NEXT:    vmovdqa64 (%rsi), %zmm6
-; AVX512BW-NEXT:    vmovdqa64 64(%rsi), %zmm14
-; AVX512BW-NEXT:    vmovdqa64 64(%rdx), %zmm7
+; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %zmm8
+; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm12
+; AVX512BW-NEXT:    vmovdqa64 (%rsi), %zmm15
+; AVX512BW-NEXT:    vmovdqa64 64(%rsi), %zmm11
+; AVX512BW-NEXT:    vmovdqa64 64(%rdx), %zmm5
 ; AVX512BW-NEXT:    vmovdqa64 (%rdx), %zmm2
 ; AVX512BW-NEXT:    vmovdqa64 (%rcx), %zmm4
-; AVX512BW-NEXT:    vmovdqa64 64(%rcx), %zmm11
+; AVX512BW-NEXT:    vmovdqa64 64(%rcx), %zmm7
 ; AVX512BW-NEXT:    vmovdqa64 (%r8), %zmm3
-; AVX512BW-NEXT:    vmovdqa64 64(%r8), %zmm8
+; AVX512BW-NEXT:    vmovdqa64 64(%r8), %zmm6
 ; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [4,12,5,13,4,12,5,13]
 ; AVX512BW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm1
-; AVX512BW-NEXT:    vpermt2q %zmm6, %zmm0, %zmm1
+; AVX512BW-NEXT:    vmovdqa64 %zmm12, %zmm1
+; AVX512BW-NEXT:    vpermt2q %zmm15, %zmm0, %zmm1
 ; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm9 = [4,12,4,12]
 ; AVX512BW-NEXT:    # ymm9 = mem[0,1,0,1]
 ; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm10
@@ -1609,116 +1604,115 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-NEXT:    movb $16, %r10b
 ; AVX512BW-NEXT:    kmovd %r10d, %k2
 ; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm1 {%k2}
-; AVX512BW-NEXT:    vpermi2q %zmm14, %zmm13, %zmm0
-; AVX512BW-NEXT:    vpermi2q %zmm11, %zmm7, %zmm9
+; AVX512BW-NEXT:    vpermi2q %zmm11, %zmm8, %zmm0
+; AVX512BW-NEXT:    vpermi2q %zmm7, %zmm5, %zmm9
 ; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm0 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm0 {%k2}
+; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm0 {%k2}
 ; AVX512BW-NEXT:    vmovdqa64 (%r9), %zmm10
-; AVX512BW-NEXT:    vmovdqa64 64(%r9), %zmm16
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm18 = [6,14,6,14,6,14,6,14]
-; AVX512BW-NEXT:    # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm15
-; AVX512BW-NEXT:    vpermt2q %zmm14, %zmm18, %zmm15
-; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm12 = [5,13,6,14,5,13,6,14]
-; AVX512BW-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm9
-; AVX512BW-NEXT:    vpermt2q %zmm11, %zmm12, %zmm9
+; AVX512BW-NEXT:    vmovdqa64 64(%r9), %zmm14
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm19 = [7,15,7,15,7,15,7,15]
+; AVX512BW-NEXT:    # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm9
+; AVX512BW-NEXT:    vpermt2q %zmm7, %zmm19, %zmm9
+; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm13
+; AVX512BW-NEXT:    vpermt2q %zmm11, %zmm19, %zmm13
+; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm9 = zmm13[0,1,2,3],zmm9[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm20 = <14,u,2,3,4,5,15,u>
+; AVX512BW-NEXT:    vpermt2q %zmm6, %zmm20, %zmm9
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm21 = [0,14,2,3,4,5,6,15]
+; AVX512BW-NEXT:    vpermt2q %zmm14, %zmm21, %zmm9
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm22 = [6,14,6,14,6,14,6,14]
+; AVX512BW-NEXT:    # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm17
+; AVX512BW-NEXT:    vpermt2q %zmm11, %zmm22, %zmm17
+; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm16 = [5,13,6,14,5,13,6,14]
+; AVX512BW-NEXT:    # zmm16 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm13
+; AVX512BW-NEXT:    vpermt2q %zmm7, %zmm16, %zmm13
 ; AVX512BW-NEXT:    movb $48, %r9b
 ; AVX512BW-NEXT:    kmovd %r9d, %k2
-; AVX512BW-NEXT:    vmovdqa64 %zmm15, %zmm9 {%k2}
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm19 = <0,1,13,u,4,5,6,7>
-; AVX512BW-NEXT:    vpermt2q %zmm8, %zmm19, %zmm9
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm20 = [0,1,2,13,4,5,6,7]
-; AVX512BW-NEXT:    vpermt2q %zmm16, %zmm20, %zmm9
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm21 = [2,10,2,10,2,10,2,10]
-; AVX512BW-NEXT:    # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm22
-; AVX512BW-NEXT:    vpermt2q %zmm14, %zmm21, %zmm22
-; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm17 = [1,9,2,10,1,9,2,10]
-; AVX512BW-NEXT:    # zmm17 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm15
-; AVX512BW-NEXT:    vpermt2q %zmm11, %zmm17, %zmm15
-; AVX512BW-NEXT:    vmovdqa64 %zmm22, %zmm15 {%k2}
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm22 = <0,1,9,u,4,5,6,7>
-; AVX512BW-NEXT:    vpermt2q %zmm8, %zmm22, %zmm15
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm23 = [0,1,2,9,4,5,6,7]
-; AVX512BW-NEXT:    vpermt2q %zmm16, %zmm23, %zmm15
-; AVX512BW-NEXT:    vpermi2q %zmm6, %zmm5, %zmm18
-; AVX512BW-NEXT:    vpermi2q %zmm4, %zmm2, %zmm12
-; AVX512BW-NEXT:    vmovdqa64 %zmm18, %zmm12 {%k2}
-; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm19, %zmm12
-; AVX512BW-NEXT:    vpermt2q %zmm10, %zmm20, %zmm12
-; AVX512BW-NEXT:    vpermi2q %zmm6, %zmm5, %zmm21
-; AVX512BW-NEXT:    vpermi2q %zmm4, %zmm2, %zmm17
-; AVX512BW-NEXT:    vmovdqa64 %zmm21, %zmm17 {%k2}
-; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm22, %zmm17
-; AVX512BW-NEXT:    vpermt2q %zmm10, %zmm23, %zmm17
-; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm19 = [0,8,1,9,0,8,1,9]
-; AVX512BW-NEXT:    # zmm19 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm18
-; AVX512BW-NEXT:    vpermt2q %zmm6, %zmm19, %zmm18
-; AVX512BW-NEXT:    vmovdqa64 (%rdx), %xmm20
+; AVX512BW-NEXT:    vmovdqa64 %zmm17, %zmm13 {%k2}
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm23 = <0,1,13,u,4,5,6,7>
+; AVX512BW-NEXT:    vpermt2q %zmm6, %zmm23, %zmm13
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm24 = [0,1,2,13,4,5,6,7]
+; AVX512BW-NEXT:    vpermt2q %zmm14, %zmm24, %zmm13
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm25 = [2,10,2,10,2,10,2,10]
+; AVX512BW-NEXT:    # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm26
+; AVX512BW-NEXT:    vpermt2q %zmm11, %zmm25, %zmm26
+; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm18 = [1,9,2,10,1,9,2,10]
+; AVX512BW-NEXT:    # zmm18 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm17
+; AVX512BW-NEXT:    vpermt2q %zmm7, %zmm18, %zmm17
+; AVX512BW-NEXT:    vmovdqa64 %zmm26, %zmm17 {%k2}
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm26 = <0,1,9,u,4,5,6,7>
+; AVX512BW-NEXT:    vpermt2q %zmm6, %zmm26, %zmm17
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm27 = [0,1,2,9,4,5,6,7]
+; AVX512BW-NEXT:    vpermt2q %zmm14, %zmm27, %zmm17
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm28
+; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm19, %zmm28
+; AVX512BW-NEXT:    vpermi2q %zmm15, %zmm12, %zmm19
+; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm19 = zmm19[0,1,2,3],zmm28[4,5,6,7]
+; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm20, %zmm19
+; AVX512BW-NEXT:    vpermt2q %zmm10, %zmm21, %zmm19
+; AVX512BW-NEXT:    vpermi2q %zmm15, %zmm12, %zmm22
+; AVX512BW-NEXT:    vpermi2q %zmm4, %zmm2, %zmm16
+; AVX512BW-NEXT:    vmovdqa64 %zmm22, %zmm16 {%k2}
+; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm23, %zmm16
+; AVX512BW-NEXT:    vpermt2q %zmm10, %zmm24, %zmm16
+; AVX512BW-NEXT:    vpermi2q %zmm15, %zmm12, %zmm25
+; AVX512BW-NEXT:    vpermi2q %zmm4, %zmm2, %zmm18
+; AVX512BW-NEXT:    vmovdqa64 %zmm25, %zmm18 {%k2}
+; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm26, %zmm18
+; AVX512BW-NEXT:    vpermt2q %zmm10, %zmm27, %zmm18
+; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm20 = [0,8,1,9,0,8,1,9]
+; AVX512BW-NEXT:    # zmm20 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vpermt2q %zmm15, %zmm20, %zmm12
+; AVX512BW-NEXT:    vmovdqa (%rdx), %xmm15
 ; AVX512BW-NEXT:    vmovdqa64 64(%rdx), %xmm21
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm20 = xmm20[0],mem[0]
-; AVX512BW-NEXT:    vinserti32x4 $1, %xmm20, %ymm0, %ymm20
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm20, %zmm0, %zmm18 {%k1}
-; AVX512BW-NEXT:    vinserti32x4 $2, (%r8), %zmm18, %zmm18
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm20 = [0,1,2,3,4,8,6,7]
-; AVX512BW-NEXT:    vpermt2q %zmm10, %zmm20, %zmm18
-; AVX512BW-NEXT:    vpermi2q %zmm14, %zmm13, %zmm19
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm21 = xmm21[0],mem[0]
-; AVX512BW-NEXT:    vinserti32x4 $1, %xmm21, %ymm0, %ymm21
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm21, %zmm0, %zmm19 {%k1}
-; AVX512BW-NEXT:    vinserti32x4 $2, 64(%r8), %zmm19, %zmm19
-; AVX512BW-NEXT:    vpermt2q %zmm16, %zmm20, %zmm19
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm20 = [7,15,7,15,7,15,7,15]
-; AVX512BW-NEXT:    # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm21
-; AVX512BW-NEXT:    vpermt2q %zmm11, %zmm20, %zmm21
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} ymm22 = [7,15,7,15]
-; AVX512BW-NEXT:    # ymm22 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vpermt2q %zmm14, %zmm22, %zmm13
-; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm13 = zmm13[0,1,2,3],zmm21[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm14 = <14,u,2,3,4,5,15,u>
-; AVX512BW-NEXT:    vpermt2q %zmm8, %zmm14, %zmm13
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm21 = [0,14,2,3,4,5,6,15]
-; AVX512BW-NEXT:    vpermt2q %zmm16, %zmm21, %zmm13
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm23 = [0,1,2,3,4,12,6,7]
-; AVX512BW-NEXT:    vpermt2q %zmm16, %zmm23, %zmm0
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm24 = [3,11,3,11,3,11,3,11]
-; AVX512BW-NEXT:    # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vpermt2q %zmm11, %zmm24, %zmm7
-; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm11
-; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %ymm25
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm25 = ymm25[1],mem[1],ymm25[3],mem[3]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm25, %zmm7, %zmm7
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm25 = <10,u,2,3,4,5,11,u>
-; AVX512BW-NEXT:    vpermt2q %zmm8, %zmm25, %zmm7
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm8 = [0,10,2,3,4,5,6,11]
-; AVX512BW-NEXT:    vpermt2q %zmm16, %zmm8, %zmm7
-; AVX512BW-NEXT:    vpermi2q %zmm4, %zmm2, %zmm20
-; AVX512BW-NEXT:    vpermt2q %zmm6, %zmm22, %zmm5
-; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm20[4,5,6,7]
-; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm14, %zmm5
-; AVX512BW-NEXT:    vpermt2q %zmm10, %zmm21, %zmm5
-; AVX512BW-NEXT:    vpermt2q %zmm10, %zmm23, %zmm1
-; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm24, %zmm2
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm4 = ymm11[1],mem[1],ymm11[3],mem[3]
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm15 = xmm15[0],mem[0]
+; AVX512BW-NEXT:    vinserti128 $1, %xmm15, %ymm0, %ymm15
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm15, %zmm0, %zmm12 {%k1}
+; AVX512BW-NEXT:    vinserti32x4 $2, (%r8), %zmm12, %zmm12
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm15 = [0,1,2,3,4,8,6,7]
+; AVX512BW-NEXT:    vpermt2q %zmm10, %zmm15, %zmm12
+; AVX512BW-NEXT:    vpermt2q %zmm11, %zmm20, %zmm8
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm11 = xmm21[0],mem[0]
+; AVX512BW-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm11, %zmm0, %zmm8 {%k1}
+; AVX512BW-NEXT:    vinserti32x4 $2, 64(%r8), %zmm8, %zmm8
+; AVX512BW-NEXT:    vpermt2q %zmm14, %zmm15, %zmm8
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,4,12,6,7]
+; AVX512BW-NEXT:    vpermt2q %zmm14, %zmm11, %zmm0
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm15 = [3,11,3,11,3,11,3,11]
+; AVX512BW-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vpermt2q %zmm7, %zmm15, %zmm5
+; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm7
+; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %ymm20
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm20 = ymm20[1],mem[1],ymm20[3],mem[3]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm20, %zmm5, %zmm5
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm20 = <10,u,2,3,4,5,11,u>
+; AVX512BW-NEXT:    vpermt2q %zmm6, %zmm20, %zmm5
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm6 = [0,10,2,3,4,5,6,11]
+; AVX512BW-NEXT:    vpermt2q %zmm14, %zmm6, %zmm5
+; AVX512BW-NEXT:    vpermt2q %zmm10, %zmm11, %zmm1
+; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm15, %zmm2
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm4 = ymm7[1],mem[1],ymm7[3],mem[3]
 ; AVX512BW-NEXT:    vinserti64x4 $0, %ymm4, %zmm2, %zmm2
-; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm25, %zmm2
-; AVX512BW-NEXT:    vpermt2q %zmm10, %zmm8, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 %zmm17, 64(%rax)
+; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm20, %zmm2
+; AVX512BW-NEXT:    vpermt2q %zmm10, %zmm6, %zmm2
+; AVX512BW-NEXT:    vmovdqa64 %zmm18, 64(%rax)
 ; AVX512BW-NEXT:    vmovdqa64 %zmm2, 128(%rax)
 ; AVX512BW-NEXT:    vmovdqa64 %zmm1, 192(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm12, 256(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm5, 320(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm15, 448(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm7, 512(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm16, 256(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm19, 320(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm17, 448(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm5, 512(%rax)
 ; AVX512BW-NEXT:    vmovdqa64 %zmm0, 576(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm9, 640(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm13, 704(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm19, 384(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm18, (%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm13, 640(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm9, 704(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm8, 384(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm12, (%rax)
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
   %in.vec0 = load <16 x i64>, ptr %in.vecptr0, align 64
@@ -3151,559 +3145,557 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ;
 ; AVX512F-LABEL: store_i64_stride6_vf32:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    subq $712, %rsp # imm = 0x2C8
-; AVX512F-NEXT:    vmovdqa64 (%rdi), %zmm7
-; AVX512F-NEXT:    vmovdqa64 64(%rdi), %zmm4
-; AVX512F-NEXT:    vmovdqa64 128(%rdi), %zmm2
-; AVX512F-NEXT:    vmovdqa64 192(%rdi), %zmm18
-; AVX512F-NEXT:    vmovdqa64 (%rsi), %zmm22
-; AVX512F-NEXT:    vmovdqa64 64(%rsi), %zmm1
-; AVX512F-NEXT:    vmovdqa64 128(%rsi), %zmm19
-; AVX512F-NEXT:    vmovdqa64 192(%rsi), %zmm21
-; AVX512F-NEXT:    vmovdqa64 (%rdx), %zmm8
-; AVX512F-NEXT:    vmovdqa64 64(%rdx), %zmm6
-; AVX512F-NEXT:    vmovdqa64 128(%rdx), %zmm5
-; AVX512F-NEXT:    vmovdqa64 192(%rdx), %zmm12
-; AVX512F-NEXT:    vmovdqa64 (%rcx), %zmm29
-; AVX512F-NEXT:    vmovdqa64 64(%rcx), %zmm27
-; AVX512F-NEXT:    vmovdqa64 128(%rcx), %zmm26
-; AVX512F-NEXT:    vmovdqa64 192(%rcx), %zmm25
-; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm11 = [4,12,5,13,4,12,5,13]
-; AVX512F-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} ymm20 = [4,12,4,12]
-; AVX512F-NEXT:    # ymm20 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm12, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm25, %zmm20, %zmm0
+; AVX512F-NEXT:    subq $648, %rsp # imm = 0x288
+; AVX512F-NEXT:    vmovdqa64 (%rdi), %zmm11
+; AVX512F-NEXT:    vmovdqa64 64(%rdi), %zmm5
+; AVX512F-NEXT:    vmovdqa64 128(%rdi), %zmm1
+; AVX512F-NEXT:    vmovdqa64 192(%rdi), %zmm19
+; AVX512F-NEXT:    vmovdqa64 (%rsi), %zmm29
+; AVX512F-NEXT:    vmovdqa64 64(%rsi), %zmm25
+; AVX512F-NEXT:    vmovdqa64 128(%rsi), %zmm23
+; AVX512F-NEXT:    vmovdqa64 192(%rsi), %zmm20
+; AVX512F-NEXT:    vmovdqa64 (%rdx), %zmm24
+; AVX512F-NEXT:    vmovdqa64 64(%rdx), %zmm4
+; AVX512F-NEXT:    vmovdqa64 128(%rdx), %zmm7
+; AVX512F-NEXT:    vmovdqa64 192(%rdx), %zmm21
+; AVX512F-NEXT:    vmovdqa64 (%rcx), %zmm18
+; AVX512F-NEXT:    vmovdqa64 64(%rcx), %zmm13
+; AVX512F-NEXT:    vmovdqa64 128(%rcx), %zmm12
+; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm27 = [4,12,5,13,4,12,5,13]
+; AVX512F-NEXT:    # zmm27 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vmovdqa64 %zmm19, %zmm0
+; AVX512F-NEXT:    vpermt2q %zmm20, %zmm27, %zmm0
+; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm0
+; AVX512F-NEXT:    vpermt2q %zmm23, %zmm27, %zmm0
 ; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm26, %zmm20, %zmm0
+; AVX512F-NEXT:    vpermt2q %zmm25, %zmm27, %zmm0
 ; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm27, %zmm20, %zmm0
+; AVX512F-NEXT:    vpermi2q %zmm29, %zmm11, %zmm27
+; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm14 = [2,10,2,10,2,10,2,10]
+; AVX512F-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vmovdqa64 %zmm11, %zmm0
+; AVX512F-NEXT:    vpermt2q %zmm29, %zmm14, %zmm0
 ; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermi2q %zmm29, %zmm8, %zmm20
-; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [1,9,2,10,1,9,2,10]
-; AVX512F-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm29, %zmm10, %zmm0
+; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm15 = [6,14,6,14,6,14,6,14]
+; AVX512F-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vmovdqa64 %zmm11, %zmm0
+; AVX512F-NEXT:    vpermt2q %zmm29, %zmm15, %zmm0
 ; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm13 = [5,13,6,14,5,13,6,14]
-; AVX512F-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm17
-; AVX512F-NEXT:    vpermt2q %zmm29, %zmm13, %zmm17
-; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm31
-; AVX512F-NEXT:    vpermt2q %zmm27, %zmm10, %zmm31
-; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm14
-; AVX512F-NEXT:    vpermt2q %zmm27, %zmm13, %zmm14
-; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm15
-; AVX512F-NEXT:    vpermt2q %zmm26, %zmm10, %zmm15
-; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm16
-; AVX512F-NEXT:    vpermt2q %zmm26, %zmm13, %zmm16
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm28 = [3,11,3,11,3,11,3,11]
-; AVX512F-NEXT:    # zmm28 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm29, %zmm28, %zmm0
+; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm16 = [7,15,7,15,7,15,7,15]
+; AVX512F-NEXT:    # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vmovdqa64 %zmm24, %zmm0
+; AVX512F-NEXT:    vpermt2q %zmm18, %zmm16, %zmm0
 ; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,7,15,7,15,7,15]
-; AVX512F-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vpermt2q %zmm29, %zmm0, %zmm8
-; AVX512F-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm29
-; AVX512F-NEXT:    vpermt2q %zmm27, %zmm28, %zmm29
-; AVX512F-NEXT:    vpermt2q %zmm27, %zmm0, %zmm6
-; AVX512F-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm6
-; AVX512F-NEXT:    vpermt2q %zmm26, %zmm28, %zmm6
-; AVX512F-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm26, %zmm0, %zmm5
-; AVX512F-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm18, %zmm26
-; AVX512F-NEXT:    vpermt2q %zmm21, %zmm11, %zmm26
-; AVX512F-NEXT:    vpermi2q %zmm25, %zmm12, %zmm10
-; AVX512F-NEXT:    vpermi2q %zmm25, %zmm12, %zmm13
-; AVX512F-NEXT:    vpermi2q %zmm25, %zmm12, %zmm28
-; AVX512F-NEXT:    vpermt2q %zmm25, %zmm0, %zmm12
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm27
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm9
-; AVX512F-NEXT:    vmovdqa64 %zmm19, %zmm5
-; AVX512F-NEXT:    vpermt2q %zmm19, %zmm11, %zmm9
-; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm0
-; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm25
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm11, %zmm25
-; AVX512F-NEXT:    vpermi2q %zmm22, %zmm7, %zmm11
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [2,10,2,10,2,10,2,10]
-; AVX512F-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm7, %zmm23
-; AVX512F-NEXT:    vpermt2q %zmm22, %zmm3, %zmm23
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm6 = [6,14,6,14,6,14,6,14]
-; AVX512F-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm7, %zmm24
-; AVX512F-NEXT:    vpermt2q %zmm22, %zmm6, %zmm24
-; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [0,8,1,9,0,8,1,9]
-; AVX512F-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm7, %zmm30
-; AVX512F-NEXT:    vpermt2q %zmm22, %zmm2, %zmm30
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} ymm19 = [7,15,7,15]
-; AVX512F-NEXT:    # ymm19 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vpermt2q %zmm22, %zmm19, %zmm7
-; AVX512F-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm22
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm3, %zmm22
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm6, %zmm4
+; AVX512F-NEXT:    vmovdqa64 %zmm11, %zmm0
+; AVX512F-NEXT:    vpermt2q %zmm29, %zmm16, %zmm0
+; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm0
+; AVX512F-NEXT:    vpermt2q %zmm25, %zmm14, %zmm0
+; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm0
+; AVX512F-NEXT:    vpermt2q %zmm25, %zmm15, %zmm0
+; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm26
+; AVX512F-NEXT:    vpermt2q %zmm13, %zmm16, %zmm26
+; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm22
+; AVX512F-NEXT:    vpermt2q %zmm25, %zmm16, %zmm22
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm0
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm30
+; AVX512F-NEXT:    vpermt2q %zmm23, %zmm14, %zmm30
+; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [0,8,1,9,0,8,1,9]
+; AVX512F-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vpermt2q %zmm29, %zmm1, %zmm11
 ; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm8
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm2, %zmm8
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm19, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm27, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm5, %zmm3, %zmm0
-; AVX512F-NEXT:    vmovdqa64 %zmm27, %zmm1
-; AVX512F-NEXT:    vpermt2q %zmm5, %zmm6, %zmm1
-; AVX512F-NEXT:    vmovdqa64 %zmm27, %zmm7
-; AVX512F-NEXT:    vpermt2q %zmm5, %zmm2, %zmm7
-; AVX512F-NEXT:    vpermt2q %zmm5, %zmm19, %zmm27
-; AVX512F-NEXT:    vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermi2q %zmm21, %zmm18, %zmm3
-; AVX512F-NEXT:    vpermi2q %zmm21, %zmm18, %zmm6
-; AVX512F-NEXT:    vpermi2q %zmm21, %zmm18, %zmm2
-; AVX512F-NEXT:    vpermt2q %zmm21, %zmm19, %zmm18
-; AVX512F-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm23, %zmm15, %zmm8
+; AVX512F-NEXT:    vpermt2q %zmm25, %zmm1, %zmm5
+; AVX512F-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm28
+; AVX512F-NEXT:    vpermt2q %zmm23, %zmm16, %zmm28
+; AVX512F-NEXT:    vpermt2q %zmm23, %zmm1, %zmm0
+; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm7, %zmm17
+; AVX512F-NEXT:    vpermt2q %zmm12, %zmm16, %zmm17
+; AVX512F-NEXT:    vpermi2q %zmm20, %zmm19, %zmm14
+; AVX512F-NEXT:    vpermi2q %zmm20, %zmm19, %zmm15
+; AVX512F-NEXT:    vmovdqa64 192(%rcx), %zmm2
+; AVX512F-NEXT:    vmovdqa64 %zmm21, %zmm10
+; AVX512F-NEXT:    vpermt2q %zmm2, %zmm16, %zmm10
+; AVX512F-NEXT:    vpermi2q %zmm20, %zmm19, %zmm16
+; AVX512F-NEXT:    vpermt2q %zmm20, %zmm1, %zmm19
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [4,12,4,12]
+; AVX512F-NEXT:    # ymm1 = mem[0,1,0,1]
+; AVX512F-NEXT:    vmovdqa64 %zmm21, %zmm6
+; AVX512F-NEXT:    vpermt2q %zmm2, %zmm1, %zmm6
+; AVX512F-NEXT:    vmovdqa64 %zmm7, %zmm5
+; AVX512F-NEXT:    vpermt2q %zmm12, %zmm1, %zmm5
+; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm3
+; AVX512F-NEXT:    vpermt2q %zmm13, %zmm1, %zmm3
+; AVX512F-NEXT:    vpermi2q %zmm18, %zmm24, %zmm1
+; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm25 = [1,9,2,10,1,9,2,10]
+; AVX512F-NEXT:    # zmm25 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vmovdqa64 %zmm24, %zmm20
+; AVX512F-NEXT:    vpermt2q %zmm18, %zmm25, %zmm20
+; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm29 = [5,13,6,14,5,13,6,14]
+; AVX512F-NEXT:    # zmm29 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vmovdqa64 %zmm24, %zmm23
+; AVX512F-NEXT:    vmovdqa64 %zmm24, %zmm9
+; AVX512F-NEXT:    vpermt2q %zmm18, %zmm29, %zmm23
+; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11]
+; AVX512F-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vpermt2q %zmm18, %zmm0, %zmm9
+; AVX512F-NEXT:    vmovdqu64 %zmm9, (%rsp) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm18
+; AVX512F-NEXT:    vpermt2q %zmm13, %zmm25, %zmm18
+; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm31
+; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm24
+; AVX512F-NEXT:    vpermt2q %zmm13, %zmm29, %zmm31
+; AVX512F-NEXT:    vpermt2q %zmm13, %zmm0, %zmm24
+; AVX512F-NEXT:    vmovdqa64 %zmm7, %zmm13
+; AVX512F-NEXT:    vpermt2q %zmm12, %zmm25, %zmm13
+; AVX512F-NEXT:    vmovdqa64 %zmm7, %zmm4
+; AVX512F-NEXT:    vpermt2q %zmm12, %zmm29, %zmm4
+; AVX512F-NEXT:    vpermt2q %zmm12, %zmm0, %zmm7
+; AVX512F-NEXT:    vpermi2q %zmm2, %zmm21, %zmm25
+; AVX512F-NEXT:    vpermi2q %zmm2, %zmm21, %zmm29
+; AVX512F-NEXT:    vpermt2q %zmm2, %zmm0, %zmm21
 ; AVX512F-NEXT:    movb $12, %al
 ; AVX512F-NEXT:    kmovw %eax, %k1
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm26 {%k1}
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm9 {%k1}
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm25 {%k1}
-; AVX512F-NEXT:    vmovdqa64 %zmm20, %zmm11 {%k1}
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm0 {%k1}
+; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm0 {%k1}
+; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm9 {%k1}
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm27 {%k1}
 ; AVX512F-NEXT:    movb $48, %al
 ; AVX512F-NEXT:    kmovw %eax, %k2
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm23, %zmm5 {%k2}
-; AVX512F-NEXT:    vmovdqa64 %zmm24, %zmm17 {%k2}
-; AVX512F-NEXT:    vmovdqa64 %zmm22, %zmm31 {%k2}
-; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm14 {%k2}
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm15 {%k2}
-; AVX512F-NEXT:    vmovdqa64 (%r8), %zmm23
-; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm16 {%k2}
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm20 {%k2}
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm23 {%k2}
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm12 # 64-byte Folded Reload
+; AVX512F-NEXT:    # zmm12 = zmm0[0,1,2,3],mem[4,5,6,7]
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm18 {%k2}
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm31 {%k2}
+; AVX512F-NEXT:    vmovdqa64 (%r8), %zmm0
+; AVX512F-NEXT:    vshufi64x2 {{.*#+}} zmm22 = zmm22[0,1,2,3],zmm26[4,5,6,7]
 ; AVX512F-NEXT:    vmovdqa64 64(%r8), %zmm1
-; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm10 {%k2}
-; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm0 = <0,1,9,u,4,5,6,7>
-; AVX512F-NEXT:    vpermt2q %zmm23, %zmm0, %zmm5
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm0, %zmm31
-; AVX512F-NEXT:    vmovdqa64 128(%r8), %zmm3
-; AVX512F-NEXT:    vpermt2q %zmm3, %zmm0, %zmm15
-; AVX512F-NEXT:    vmovdqa64 192(%r8), %zmm22
-; AVX512F-NEXT:    vpermt2q %zmm22, %zmm0, %zmm10
-; AVX512F-NEXT:    vmovdqa64 (%r9), %zmm20
-; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [0,1,2,9,4,5,6,7]
-; AVX512F-NEXT:    vpermt2q %zmm20, %zmm0, %zmm5
-; AVX512F-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 64(%r9), %zmm21
-; AVX512F-NEXT:    vpermt2q %zmm21, %zmm0, %zmm31
-; AVX512F-NEXT:    vmovdqa64 128(%r9), %zmm19
-; AVX512F-NEXT:    vpermt2q %zmm19, %zmm0, %zmm15
-; AVX512F-NEXT:    vmovdqa64 192(%r9), %zmm18
-; AVX512F-NEXT:    vpermt2q %zmm18, %zmm0, %zmm10
-; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm13 {%k2}
-; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm0 = <0,1,13,u,4,5,6,7>
-; AVX512F-NEXT:    vmovdqa64 %zmm17, %zmm27
-; AVX512F-NEXT:    vpermt2q %zmm23, %zmm0, %zmm27
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm0, %zmm14
-; AVX512F-NEXT:    vpermt2q %zmm3, %zmm0, %zmm16
-; AVX512F-NEXT:    vpermt2q %zmm22, %zmm0, %zmm13
-; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [0,1,2,13,4,5,6,7]
-; AVX512F-NEXT:    vpermt2q %zmm20, %zmm0, %zmm27
-; AVX512F-NEXT:    vpermt2q %zmm21, %zmm0, %zmm14
-; AVX512F-NEXT:    vpermt2q %zmm19, %zmm0, %zmm16
-; AVX512F-NEXT:    vpermt2q %zmm18, %zmm0, %zmm13
-; AVX512F-NEXT:    vmovdqa (%rdx), %xmm0
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0]
-; AVX512F-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm30 {%k1}
-; AVX512F-NEXT:    vmovdqa 64(%rdx), %xmm0
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0]
-; AVX512F-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm8 {%k1}
-; AVX512F-NEXT:    vmovdqa 128(%rdx), %xmm0
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0]
-; AVX512F-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm7 {%k1}
-; AVX512F-NEXT:    vmovdqa 192(%rdx), %xmm0
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0]
-; AVX512F-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k1}
-; AVX512F-NEXT:    vinserti32x4 $2, (%r8), %zmm30, %zmm24
-; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,8,6,7]
-; AVX512F-NEXT:    vpermt2q %zmm20, %zmm0, %zmm24
-; AVX512F-NEXT:    vinserti32x4 $2, 64(%r8), %zmm8, %zmm6
-; AVX512F-NEXT:    vpermt2q %zmm21, %zmm0, %zmm6
-; AVX512F-NEXT:    vinserti32x4 $2, 128(%r8), %zmm7, %zmm5
-; AVX512F-NEXT:    vpermt2q %zmm19, %zmm0, %zmm5
-; AVX512F-NEXT:    vinserti32x4 $2, 192(%r8), %zmm2, %zmm4
-; AVX512F-NEXT:    vpermt2q %zmm18, %zmm0, %zmm4
-; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm0
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 64-byte Folded Reload
-; AVX512F-NEXT:    # zmm2 = zmm2[0,1,2,3],mem[4,5,6,7]
+; AVX512F-NEXT:    vmovdqa64 %zmm30, %zmm13 {%k2}
+; AVX512F-NEXT:    vmovdqa64 128(%r8), %zmm2
+; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm4 {%k2}
+; AVX512F-NEXT:    vmovdqa64 192(%r8), %zmm26
+; AVX512F-NEXT:    vshufi64x2 {{.*#+}} zmm3 = zmm28[0,1,2,3],zmm17[4,5,6,7]
+; AVX512F-NEXT:    vmovdqa64 (%r9), %zmm8
+; AVX512F-NEXT:    vmovdqa64 %zmm14, %zmm25 {%k2}
+; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm5 = <0,1,9,u,4,5,6,7>
+; AVX512F-NEXT:    vpermt2q %zmm0, %zmm5, %zmm20
+; AVX512F-NEXT:    vpermt2q %zmm1, %zmm5, %zmm18
+; AVX512F-NEXT:    vpermt2q %zmm2, %zmm5, %zmm13
+; AVX512F-NEXT:    vpermt2q %zmm26, %zmm5, %zmm25
+; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm5 = [0,1,2,9,4,5,6,7]
+; AVX512F-NEXT:    vpermt2q %zmm8, %zmm5, %zmm20
+; AVX512F-NEXT:    vmovdqa64 64(%r9), %zmm17
+; AVX512F-NEXT:    vpermt2q %zmm17, %zmm5, %zmm18
+; AVX512F-NEXT:    vmovdqa64 128(%r9), %zmm30
+; AVX512F-NEXT:    vpermt2q %zmm30, %zmm5, %zmm13
+; AVX512F-NEXT:    vmovdqa64 192(%r9), %zmm14
+; AVX512F-NEXT:    vpermt2q %zmm14, %zmm5, %zmm25
+; AVX512F-NEXT:    vmovdqa64 %zmm15, %zmm29 {%k2}
+; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm5 = <0,1,13,u,4,5,6,7>
+; AVX512F-NEXT:    vpermt2q %zmm0, %zmm5, %zmm23
+; AVX512F-NEXT:    vpermt2q %zmm1, %zmm5, %zmm31
+; AVX512F-NEXT:    vpermt2q %zmm2, %zmm5, %zmm4
+; AVX512F-NEXT:    vpermt2q %zmm26, %zmm5, %zmm29
+; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm5 = [0,1,2,13,4,5,6,7]
+; AVX512F-NEXT:    vpermt2q %zmm8, %zmm5, %zmm23
+; AVX512F-NEXT:    vpermt2q %zmm17, %zmm5, %zmm31
+; AVX512F-NEXT:    vpermt2q %zmm30, %zmm5, %zmm4
+; AVX512F-NEXT:    vpermt2q %zmm14, %zmm5, %zmm29
+; AVX512F-NEXT:    vshufi64x2 {{.*#+}} zmm15 = zmm16[0,1,2,3],zmm10[4,5,6,7]
+; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm5 = <14,u,2,3,4,5,15,u>
+; AVX512F-NEXT:    vpermt2q %zmm0, %zmm5, %zmm12
+; AVX512F-NEXT:    vpermt2q %zmm1, %zmm5, %zmm22
+; AVX512F-NEXT:    vpermt2q %zmm2, %zmm5, %zmm3
+; AVX512F-NEXT:    vpermt2q %zmm26, %zmm5, %zmm15
+; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm5 = [0,14,2,3,4,5,6,15]
+; AVX512F-NEXT:    vpermt2q %zmm8, %zmm5, %zmm12
+; AVX512F-NEXT:    vpermt2q %zmm17, %zmm5, %zmm22
+; AVX512F-NEXT:    vpermt2q %zmm30, %zmm5, %zmm3
+; AVX512F-NEXT:    vpermt2q %zmm14, %zmm5, %zmm15
+; AVX512F-NEXT:    vmovdqa (%rdx), %xmm5
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0]
+; AVX512F-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm5, %zmm0, %zmm11 {%k1}
+; AVX512F-NEXT:    vmovdqa 64(%rdx), %xmm5
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0]
+; AVX512F-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm5, %zmm0, %zmm10 {%k1}
+; AVX512F-NEXT:    vmovdqa 128(%rdx), %xmm5
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0]
+; AVX512F-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm5, %zmm0, %zmm16 {%k1}
+; AVX512F-NEXT:    vmovdqa 192(%rdx), %xmm5
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0]
+; AVX512F-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm5, %zmm0, %zmm19 {%k1}
+; AVX512F-NEXT:    vinserti32x4 $2, (%r8), %zmm11, %zmm5
+; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,4,8,6,7]
+; AVX512F-NEXT:    vpermt2q %zmm8, %zmm6, %zmm5
+; AVX512F-NEXT:    vinserti32x4 $2, 64(%r8), %zmm10, %zmm10
+; AVX512F-NEXT:    vpermt2q %zmm17, %zmm6, %zmm10
+; AVX512F-NEXT:    vinserti32x4 $2, 128(%r8), %zmm16, %zmm11
+; AVX512F-NEXT:    vpermt2q %zmm30, %zmm6, %zmm11
+; AVX512F-NEXT:    vinserti32x4 $2, 192(%r8), %zmm19, %zmm16
+; AVX512F-NEXT:    vpermt2q %zmm14, %zmm6, %zmm16
+; AVX512F-NEXT:    vmovdqa (%rdi), %ymm6
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],mem[1],ymm6[3],mem[3]
+; AVX512F-NEXT:    vmovdqu64 (%rsp), %zmm19 # 64-byte Reload
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm6, %zmm19, %zmm6
 ; AVX512F-NEXT:    movb $16, %al
 ; AVX512F-NEXT:    kmovw %eax, %k1
-; AVX512F-NEXT:    vmovdqa64 %zmm23, %zmm11 {%k1}
-; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm7 = <10,u,2,3,4,5,11,u>
-; AVX512F-NEXT:    vpermt2q %zmm23, %zmm7, %zmm0
-; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm17 = <14,u,2,3,4,5,15,u>
-; AVX512F-NEXT:    vpermt2q %zmm23, %zmm17, %zmm2
-; AVX512F-NEXT:    vmovdqa64 64(%rdi), %ymm23
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm23 = ymm23[1],mem[1],ymm23[3],mem[3]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm23, %zmm29, %zmm23
-; AVX512F-NEXT:    vmovdqu64 (%rsp), %zmm8 # 64-byte Reload
-; AVX512F-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm29 # 64-byte Folded Reload
-; AVX512F-NEXT:    # zmm29 = zmm8[0,1,2,3],mem[4,5,6,7]
-; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm25 {%k1}
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm7, %zmm23
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm17, %zmm29
+; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm27 {%k1}
+; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm19 = <10,u,2,3,4,5,11,u>
+; AVX512F-NEXT:    vpermt2q %zmm0, %zmm19, %zmm6
+; AVX512F-NEXT:    vmovdqa 64(%rdi), %ymm0
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm24, %zmm0
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm9 {%k1}
+; AVX512F-NEXT:    vpermt2q %zmm1, %zmm19, %zmm0
 ; AVX512F-NEXT:    vmovdqa 128(%rdi), %ymm1
 ; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3]
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm8, %zmm1
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512F-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm30 # 64-byte Folded Reload
-; AVX512F-NEXT:    # zmm30 = zmm8[0,1,2,3],mem[4,5,6,7]
-; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm9 {%k1}
-; AVX512F-NEXT:    vpermt2q %zmm3, %zmm7, %zmm1
-; AVX512F-NEXT:    vpermt2q %zmm3, %zmm17, %zmm30
-; AVX512F-NEXT:    vmovdqa 192(%rdi), %ymm3
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm3, %zmm28, %zmm3
-; AVX512F-NEXT:    vpermt2q %zmm22, %zmm7, %zmm3
-; AVX512F-NEXT:    vmovdqa64 %zmm22, %zmm26 {%k1}
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512F-NEXT:    vshufi64x2 {{.*#+}} zmm7 = zmm7[0,1,2,3],zmm12[4,5,6,7]
-; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm28 = [0,10,2,3,4,5,6,11]
-; AVX512F-NEXT:    vpermt2q %zmm20, %zmm28, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm22, %zmm17, %zmm7
-; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm17 = [0,1,2,3,4,12,6,7]
-; AVX512F-NEXT:    vpermt2q %zmm20, %zmm17, %zmm11
-; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm22 = [0,14,2,3,4,5,6,15]
-; AVX512F-NEXT:    vpermt2q %zmm20, %zmm22, %zmm2
-; AVX512F-NEXT:    vpermt2q %zmm21, %zmm28, %zmm23
-; AVX512F-NEXT:    vpermt2q %zmm21, %zmm17, %zmm25
-; AVX512F-NEXT:    vpermt2q %zmm21, %zmm22, %zmm29
-; AVX512F-NEXT:    vpermt2q %zmm19, %zmm28, %zmm1
-; AVX512F-NEXT:    vpermt2q %zmm19, %zmm17, %zmm9
-; AVX512F-NEXT:    vpermt2q %zmm19, %zmm22, %zmm30
-; AVX512F-NEXT:    vpermt2q %zmm18, %zmm28, %zmm3
-; AVX512F-NEXT:    vpermt2q %zmm18, %zmm17, %zmm26
-; AVX512F-NEXT:    vpermt2q %zmm18, %zmm22, %zmm7
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm7, %zmm1
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm28 {%k1}
+; AVX512F-NEXT:    vpermt2q %zmm2, %zmm19, %zmm1
+; AVX512F-NEXT:    vmovdqa 192(%rdi), %ymm2
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm21, %zmm2
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm26, %zmm24 {%k1}
+; AVX512F-NEXT:    vpermt2q %zmm26, %zmm19, %zmm2
+; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm19 = [0,10,2,3,4,5,6,11]
+; AVX512F-NEXT:    vpermt2q %zmm8, %zmm19, %zmm6
+; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm21 = [0,1,2,3,4,12,6,7]
+; AVX512F-NEXT:    vpermt2q %zmm8, %zmm21, %zmm27
+; AVX512F-NEXT:    vpermt2q %zmm17, %zmm19, %zmm0
+; AVX512F-NEXT:    vpermt2q %zmm17, %zmm21, %zmm9
+; AVX512F-NEXT:    vpermt2q %zmm30, %zmm19, %zmm1
+; AVX512F-NEXT:    vpermt2q %zmm30, %zmm21, %zmm28
+; AVX512F-NEXT:    vpermt2q %zmm14, %zmm19, %zmm2
+; AVX512F-NEXT:    vpermt2q %zmm14, %zmm21, %zmm24
 ; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512F-NEXT:    vmovdqa64 %zmm7, 1472(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm13, 1408(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm26, 1344(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm3, 1280(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm10, 1216(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm30, 1088(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm16, 1024(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm9, 960(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm15, 1472(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm29, 1408(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm24, 1344(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm2, 1280(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm25, 1216(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm3, 1088(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm4, 1024(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm28, 960(%rax)
 ; AVX512F-NEXT:    vmovdqa64 %zmm1, 896(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm15, 832(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm29, 704(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm14, 640(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm25, 576(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm23, 512(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm31, 448(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm2, 320(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm27, 256(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm11, 192(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm0, 128(%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 64(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm4, 1152(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm5, 768(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm6, 384(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm24, (%rax)
-; AVX512F-NEXT:    addq $712, %rsp # imm = 0x2C8
+; AVX512F-NEXT:    vmovdqa64 %zmm13, 832(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm22, 704(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm31, 640(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm9, 576(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm0, 512(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm18, 448(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm12, 320(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm23, 256(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm27, 192(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm6, 128(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm20, 64(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm16, 1152(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm11, 768(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm10, 384(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm5, (%rax)
+; AVX512F-NEXT:    addq $648, %rsp # imm = 0x288
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512BW-LABEL: store_i64_stride6_vf32:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    subq $712, %rsp # imm = 0x2C8
-; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm7
-; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %zmm4
-; AVX512BW-NEXT:    vmovdqa64 128(%rdi), %zmm2
-; AVX512BW-NEXT:    vmovdqa64 192(%rdi), %zmm18
-; AVX512BW-NEXT:    vmovdqa64 (%rsi), %zmm22
-; AVX512BW-NEXT:    vmovdqa64 64(%rsi), %zmm1
-; AVX512BW-NEXT:    vmovdqa64 128(%rsi), %zmm19
-; AVX512BW-NEXT:    vmovdqa64 192(%rsi), %zmm21
-; AVX512BW-NEXT:    vmovdqa64 (%rdx), %zmm8
-; AVX512BW-NEXT:    vmovdqa64 64(%rdx), %zmm6
-; AVX512BW-NEXT:    vmovdqa64 128(%rdx), %zmm5
-; AVX512BW-NEXT:    vmovdqa64 192(%rdx), %zmm12
-; AVX512BW-NEXT:    vmovdqa64 (%rcx), %zmm29
-; AVX512BW-NEXT:    vmovdqa64 64(%rcx), %zmm27
-; AVX512BW-NEXT:    vmovdqa64 128(%rcx), %zmm26
-; AVX512BW-NEXT:    vmovdqa64 192(%rcx), %zmm25
-; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm11 = [4,12,5,13,4,12,5,13]
-; AVX512BW-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} ymm20 = [4,12,4,12]
-; AVX512BW-NEXT:    # ymm20 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm12, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm25, %zmm20, %zmm0
+; AVX512BW-NEXT:    subq $648, %rsp # imm = 0x288
+; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm11
+; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %zmm5
+; AVX512BW-NEXT:    vmovdqa64 128(%rdi), %zmm1
+; AVX512BW-NEXT:    vmovdqa64 192(%rdi), %zmm19
+; AVX512BW-NEXT:    vmovdqa64 (%rsi), %zmm29
+; AVX512BW-NEXT:    vmovdqa64 64(%rsi), %zmm25
+; AVX512BW-NEXT:    vmovdqa64 128(%rsi), %zmm23
+; AVX512BW-NEXT:    vmovdqa64 192(%rsi), %zmm20
+; AVX512BW-NEXT:    vmovdqa64 (%rdx), %zmm24
+; AVX512BW-NEXT:    vmovdqa64 64(%rdx), %zmm4
+; AVX512BW-NEXT:    vmovdqa64 128(%rdx), %zmm7
+; AVX512BW-NEXT:    vmovdqa64 192(%rdx), %zmm21
+; AVX512BW-NEXT:    vmovdqa64 (%rcx), %zmm18
+; AVX512BW-NEXT:    vmovdqa64 64(%rcx), %zmm13
+; AVX512BW-NEXT:    vmovdqa64 128(%rcx), %zmm12
+; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm27 = [4,12,5,13,4,12,5,13]
+; AVX512BW-NEXT:    # zmm27 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vmovdqa64 %zmm19, %zmm0
+; AVX512BW-NEXT:    vpermt2q %zmm20, %zmm27, %zmm0
+; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm0
+; AVX512BW-NEXT:    vpermt2q %zmm23, %zmm27, %zmm0
 ; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm26, %zmm20, %zmm0
+; AVX512BW-NEXT:    vpermt2q %zmm25, %zmm27, %zmm0
 ; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm27, %zmm20, %zmm0
+; AVX512BW-NEXT:    vpermi2q %zmm29, %zmm11, %zmm27
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm14 = [2,10,2,10,2,10,2,10]
+; AVX512BW-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm0
+; AVX512BW-NEXT:    vpermt2q %zmm29, %zmm14, %zmm0
 ; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermi2q %zmm29, %zmm8, %zmm20
-; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [1,9,2,10,1,9,2,10]
-; AVX512BW-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm29, %zmm10, %zmm0
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm15 = [6,14,6,14,6,14,6,14]
+; AVX512BW-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm0
+; AVX512BW-NEXT:    vpermt2q %zmm29, %zmm15, %zmm0
 ; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm13 = [5,13,6,14,5,13,6,14]
-; AVX512BW-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm17
-; AVX512BW-NEXT:    vpermt2q %zmm29, %zmm13, %zmm17
-; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm31
-; AVX512BW-NEXT:    vpermt2q %zmm27, %zmm10, %zmm31
-; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm14
-; AVX512BW-NEXT:    vpermt2q %zmm27, %zmm13, %zmm14
-; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm15
-; AVX512BW-NEXT:    vpermt2q %zmm26, %zmm10, %zmm15
-; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm16
-; AVX512BW-NEXT:    vpermt2q %zmm26, %zmm13, %zmm16
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm28 = [3,11,3,11,3,11,3,11]
-; AVX512BW-NEXT:    # zmm28 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm29, %zmm28, %zmm0
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm16 = [7,15,7,15,7,15,7,15]
+; AVX512BW-NEXT:    # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vmovdqa64 %zmm24, %zmm0
+; AVX512BW-NEXT:    vpermt2q %zmm18, %zmm16, %zmm0
 ; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,7,15,7,15,7,15]
-; AVX512BW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vpermt2q %zmm29, %zmm0, %zmm8
-; AVX512BW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm29
-; AVX512BW-NEXT:    vpermt2q %zmm27, %zmm28, %zmm29
-; AVX512BW-NEXT:    vpermt2q %zmm27, %zmm0, %zmm6
-; AVX512BW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm6
-; AVX512BW-NEXT:    vpermt2q %zmm26, %zmm28, %zmm6
-; AVX512BW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm26, %zmm0, %zmm5
-; AVX512BW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm18, %zmm26
-; AVX512BW-NEXT:    vpermt2q %zmm21, %zmm11, %zmm26
-; AVX512BW-NEXT:    vpermi2q %zmm25, %zmm12, %zmm10
-; AVX512BW-NEXT:    vpermi2q %zmm25, %zmm12, %zmm13
-; AVX512BW-NEXT:    vpermi2q %zmm25, %zmm12, %zmm28
-; AVX512BW-NEXT:    vpermt2q %zmm25, %zmm0, %zmm12
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm27
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm9
-; AVX512BW-NEXT:    vmovdqa64 %zmm19, %zmm5
-; AVX512BW-NEXT:    vpermt2q %zmm19, %zmm11, %zmm9
-; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm0
-; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm25
-; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm11, %zmm25
-; AVX512BW-NEXT:    vpermi2q %zmm22, %zmm7, %zmm11
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [2,10,2,10,2,10,2,10]
-; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm23
-; AVX512BW-NEXT:    vpermt2q %zmm22, %zmm3, %zmm23
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm6 = [6,14,6,14,6,14,6,14]
-; AVX512BW-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm24
-; AVX512BW-NEXT:    vpermt2q %zmm22, %zmm6, %zmm24
-; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [0,8,1,9,0,8,1,9]
-; AVX512BW-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm30
-; AVX512BW-NEXT:    vpermt2q %zmm22, %zmm2, %zmm30
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} ymm19 = [7,15,7,15]
-; AVX512BW-NEXT:    # ymm19 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vpermt2q %zmm22, %zmm19, %zmm7
-; AVX512BW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm22
-; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm3, %zmm22
-; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm6, %zmm4
+; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm0
+; AVX512BW-NEXT:    vpermt2q %zmm29, %zmm16, %zmm0
+; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm0
+; AVX512BW-NEXT:    vpermt2q %zmm25, %zmm14, %zmm0
+; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm0
+; AVX512BW-NEXT:    vpermt2q %zmm25, %zmm15, %zmm0
+; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm26
+; AVX512BW-NEXT:    vpermt2q %zmm13, %zmm16, %zmm26
+; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm22
+; AVX512BW-NEXT:    vpermt2q %zmm25, %zmm16, %zmm22
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm0
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm30
+; AVX512BW-NEXT:    vpermt2q %zmm23, %zmm14, %zmm30
+; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [0,8,1,9,0,8,1,9]
+; AVX512BW-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vpermt2q %zmm29, %zmm1, %zmm11
 ; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm8
-; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm2, %zmm8
-; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm19, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm27, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm5, %zmm3, %zmm0
-; AVX512BW-NEXT:    vmovdqa64 %zmm27, %zmm1
-; AVX512BW-NEXT:    vpermt2q %zmm5, %zmm6, %zmm1
-; AVX512BW-NEXT:    vmovdqa64 %zmm27, %zmm7
-; AVX512BW-NEXT:    vpermt2q %zmm5, %zmm2, %zmm7
-; AVX512BW-NEXT:    vpermt2q %zmm5, %zmm19, %zmm27
-; AVX512BW-NEXT:    vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermi2q %zmm21, %zmm18, %zmm3
-; AVX512BW-NEXT:    vpermi2q %zmm21, %zmm18, %zmm6
-; AVX512BW-NEXT:    vpermi2q %zmm21, %zmm18, %zmm2
-; AVX512BW-NEXT:    vpermt2q %zmm21, %zmm19, %zmm18
-; AVX512BW-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm23, %zmm15, %zmm8
+; AVX512BW-NEXT:    vpermt2q %zmm25, %zmm1, %zmm5
+; AVX512BW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm28
+; AVX512BW-NEXT:    vpermt2q %zmm23, %zmm16, %zmm28
+; AVX512BW-NEXT:    vpermt2q %zmm23, %zmm1, %zmm0
+; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm17
+; AVX512BW-NEXT:    vpermt2q %zmm12, %zmm16, %zmm17
+; AVX512BW-NEXT:    vpermi2q %zmm20, %zmm19, %zmm14
+; AVX512BW-NEXT:    vpermi2q %zmm20, %zmm19, %zmm15
+; AVX512BW-NEXT:    vmovdqa64 192(%rcx), %zmm2
+; AVX512BW-NEXT:    vmovdqa64 %zmm21, %zmm10
+; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm16, %zmm10
+; AVX512BW-NEXT:    vpermi2q %zmm20, %zmm19, %zmm16
+; AVX512BW-NEXT:    vpermt2q %zmm20, %zmm1, %zmm19
+; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [4,12,4,12]
+; AVX512BW-NEXT:    # ymm1 = mem[0,1,0,1]
+; AVX512BW-NEXT:    vmovdqa64 %zmm21, %zmm6
+; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm1, %zmm6
+; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm5
+; AVX512BW-NEXT:    vpermt2q %zmm12, %zmm1, %zmm5
+; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm3
+; AVX512BW-NEXT:    vpermt2q %zmm13, %zmm1, %zmm3
+; AVX512BW-NEXT:    vpermi2q %zmm18, %zmm24, %zmm1
+; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm25 = [1,9,2,10,1,9,2,10]
+; AVX512BW-NEXT:    # zmm25 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vmovdqa64 %zmm24, %zmm20
+; AVX512BW-NEXT:    vpermt2q %zmm18, %zmm25, %zmm20
+; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm29 = [5,13,6,14,5,13,6,14]
+; AVX512BW-NEXT:    # zmm29 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vmovdqa64 %zmm24, %zmm23
+; AVX512BW-NEXT:    vmovdqa64 %zmm24, %zmm9
+; AVX512BW-NEXT:    vpermt2q %zmm18, %zmm29, %zmm23
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11]
+; AVX512BW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vpermt2q %zmm18, %zmm0, %zmm9
+; AVX512BW-NEXT:    vmovdqu64 %zmm9, (%rsp) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm18
+; AVX512BW-NEXT:    vpermt2q %zmm13, %zmm25, %zmm18
+; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm31
+; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm24
+; AVX512BW-NEXT:    vpermt2q %zmm13, %zmm29, %zmm31
+; AVX512BW-NEXT:    vpermt2q %zmm13, %zmm0, %zmm24
+; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm13
+; AVX512BW-NEXT:    vpermt2q %zmm12, %zmm25, %zmm13
+; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm4
+; AVX512BW-NEXT:    vpermt2q %zmm12, %zmm29, %zmm4
+; AVX512BW-NEXT:    vpermt2q %zmm12, %zmm0, %zmm7
+; AVX512BW-NEXT:    vpermi2q %zmm2, %zmm21, %zmm25
+; AVX512BW-NEXT:    vpermi2q %zmm2, %zmm21, %zmm29
+; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm0, %zmm21
 ; AVX512BW-NEXT:    movb $12, %al
 ; AVX512BW-NEXT:    kmovd %eax, %k1
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm26 {%k1}
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm9 {%k1}
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm25 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 %zmm20, %zmm11 {%k1}
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm0 {%k1}
+; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm0 {%k1}
+; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm9 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm27 {%k1}
 ; AVX512BW-NEXT:    movb $48, %al
 ; AVX512BW-NEXT:    kmovd %eax, %k2
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm23, %zmm5 {%k2}
-; AVX512BW-NEXT:    vmovdqa64 %zmm24, %zmm17 {%k2}
-; AVX512BW-NEXT:    vmovdqa64 %zmm22, %zmm31 {%k2}
-; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm14 {%k2}
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm15 {%k2}
-; AVX512BW-NEXT:    vmovdqa64 (%r8), %zmm23
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm16 {%k2}
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm20 {%k2}
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm23 {%k2}
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm12 # 64-byte Folded Reload
+; AVX512BW-NEXT:    # zmm12 = zmm0[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm18 {%k2}
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm31 {%k2}
+; AVX512BW-NEXT:    vmovdqa64 (%r8), %zmm0
+; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm22 = zmm22[0,1,2,3],zmm26[4,5,6,7]
 ; AVX512BW-NEXT:    vmovdqa64 64(%r8), %zmm1
-; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm10 {%k2}
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm0 = <0,1,9,u,4,5,6,7>
-; AVX512BW-NEXT:    vpermt2q %zmm23, %zmm0, %zmm5
-; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm0, %zmm31
-; AVX512BW-NEXT:    vmovdqa64 128(%r8), %zmm3
-; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm0, %zmm15
-; AVX512BW-NEXT:    vmovdqa64 192(%r8), %zmm22
-; AVX512BW-NEXT:    vpermt2q %zmm22, %zmm0, %zmm10
-; AVX512BW-NEXT:    vmovdqa64 (%r9), %zmm20
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [0,1,2,9,4,5,6,7]
-; AVX512BW-NEXT:    vpermt2q %zmm20, %zmm0, %zmm5
-; AVX512BW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 64(%r9), %zmm21
-; AVX512BW-NEXT:    vpermt2q %zmm21, %zmm0, %zmm31
-; AVX512BW-NEXT:    vmovdqa64 128(%r9), %zmm19
-; AVX512BW-NEXT:    vpermt2q %zmm19, %zmm0, %zmm15
-; AVX512BW-NEXT:    vmovdqa64 192(%r9), %zmm18
-; AVX512BW-NEXT:    vpermt2q %zmm18, %zmm0, %zmm10
-; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm13 {%k2}
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm0 = <0,1,13,u,4,5,6,7>
-; AVX512BW-NEXT:    vmovdqa64 %zmm17, %zmm27
-; AVX512BW-NEXT:    vpermt2q %zmm23, %zmm0, %zmm27
-; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm0, %zmm14
-; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm0, %zmm16
-; AVX512BW-NEXT:    vpermt2q %zmm22, %zmm0, %zmm13
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [0,1,2,13,4,5,6,7]
-; AVX512BW-NEXT:    vpermt2q %zmm20, %zmm0, %zmm27
-; AVX512BW-NEXT:    vpermt2q %zmm21, %zmm0, %zmm14
-; AVX512BW-NEXT:    vpermt2q %zmm19, %zmm0, %zmm16
-; AVX512BW-NEXT:    vpermt2q %zmm18, %zmm0, %zmm13
-; AVX512BW-NEXT:    vmovdqa (%rdx), %xmm0
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0]
-; AVX512BW-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm30 {%k1}
-; AVX512BW-NEXT:    vmovdqa 64(%rdx), %xmm0
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0]
-; AVX512BW-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm8 {%k1}
-; AVX512BW-NEXT:    vmovdqa 128(%rdx), %xmm0
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0]
-; AVX512BW-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm7 {%k1}
-; AVX512BW-NEXT:    vmovdqa 192(%rdx), %xmm0
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0]
-; AVX512BW-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k1}
-; AVX512BW-NEXT:    vinserti32x4 $2, (%r8), %zmm30, %zmm24
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,8,6,7]
-; AVX512BW-NEXT:    vpermt2q %zmm20, %zmm0, %zmm24
-; AVX512BW-NEXT:    vinserti32x4 $2, 64(%r8), %zmm8, %zmm6
-; AVX512BW-NEXT:    vpermt2q %zmm21, %zmm0, %zmm6
-; AVX512BW-NEXT:    vinserti32x4 $2, 128(%r8), %zmm7, %zmm5
-; AVX512BW-NEXT:    vpermt2q %zmm19, %zmm0, %zmm5
-; AVX512BW-NEXT:    vinserti32x4 $2, 192(%r8), %zmm2, %zmm4
-; AVX512BW-NEXT:    vpermt2q %zmm18, %zmm0, %zmm4
-; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 64-byte Folded Reload
-; AVX512BW-NEXT:    # zmm2 = zmm2[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqa64 %zmm30, %zmm13 {%k2}
+; AVX512BW-NEXT:    vmovdqa64 128(%r8), %zmm2
+; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm4 {%k2}
+; AVX512BW-NEXT:    vmovdqa64 192(%r8), %zmm26
+; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm3 = zmm28[0,1,2,3],zmm17[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqa64 (%r9), %zmm8
+; AVX512BW-NEXT:    vmovdqa64 %zmm14, %zmm25 {%k2}
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm5 = <0,1,9,u,4,5,6,7>
+; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm5, %zmm20
+; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm5, %zmm18
+; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm5, %zmm13
+; AVX512BW-NEXT:    vpermt2q %zmm26, %zmm5, %zmm25
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm5 = [0,1,2,9,4,5,6,7]
+; AVX512BW-NEXT:    vpermt2q %zmm8, %zmm5, %zmm20
+; AVX512BW-NEXT:    vmovdqa64 64(%r9), %zmm17
+; AVX512BW-NEXT:    vpermt2q %zmm17, %zmm5, %zmm18
+; AVX512BW-NEXT:    vmovdqa64 128(%r9), %zmm30
+; AVX512BW-NEXT:    vpermt2q %zmm30, %zmm5, %zmm13
+; AVX512BW-NEXT:    vmovdqa64 192(%r9), %zmm14
+; AVX512BW-NEXT:    vpermt2q %zmm14, %zmm5, %zmm25
+; AVX512BW-NEXT:    vmovdqa64 %zmm15, %zmm29 {%k2}
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm5 = <0,1,13,u,4,5,6,7>
+; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm5, %zmm23
+; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm5, %zmm31
+; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm5, %zmm4
+; AVX512BW-NEXT:    vpermt2q %zmm26, %zmm5, %zmm29
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm5 = [0,1,2,13,4,5,6,7]
+; AVX512BW-NEXT:    vpermt2q %zmm8, %zmm5, %zmm23
+; AVX512BW-NEXT:    vpermt2q %zmm17, %zmm5, %zmm31
+; AVX512BW-NEXT:    vpermt2q %zmm30, %zmm5, %zmm4
+; AVX512BW-NEXT:    vpermt2q %zmm14, %zmm5, %zmm29
+; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm15 = zmm16[0,1,2,3],zmm10[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm5 = <14,u,2,3,4,5,15,u>
+; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm5, %zmm12
+; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm5, %zmm22
+; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm5, %zmm3
+; AVX512BW-NEXT:    vpermt2q %zmm26, %zmm5, %zmm15
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm5 = [0,14,2,3,4,5,6,15]
+; AVX512BW-NEXT:    vpermt2q %zmm8, %zmm5, %zmm12
+; AVX512BW-NEXT:    vpermt2q %zmm17, %zmm5, %zmm22
+; AVX512BW-NEXT:    vpermt2q %zmm30, %zmm5, %zmm3
+; AVX512BW-NEXT:    vpermt2q %zmm14, %zmm5, %zmm15
+; AVX512BW-NEXT:    vmovdqa (%rdx), %xmm5
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0]
+; AVX512BW-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm5, %zmm0, %zmm11 {%k1}
+; AVX512BW-NEXT:    vmovdqa 64(%rdx), %xmm5
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0]
+; AVX512BW-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm5, %zmm0, %zmm10 {%k1}
+; AVX512BW-NEXT:    vmovdqa 128(%rdx), %xmm5
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0]
+; AVX512BW-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm5, %zmm0, %zmm16 {%k1}
+; AVX512BW-NEXT:    vmovdqa 192(%rdx), %xmm5
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0]
+; AVX512BW-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm5, %zmm0, %zmm19 {%k1}
+; AVX512BW-NEXT:    vinserti32x4 $2, (%r8), %zmm11, %zmm5
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,4,8,6,7]
+; AVX512BW-NEXT:    vpermt2q %zmm8, %zmm6, %zmm5
+; AVX512BW-NEXT:    vinserti32x4 $2, 64(%r8), %zmm10, %zmm10
+; AVX512BW-NEXT:    vpermt2q %zmm17, %zmm6, %zmm10
+; AVX512BW-NEXT:    vinserti32x4 $2, 128(%r8), %zmm16, %zmm11
+; AVX512BW-NEXT:    vpermt2q %zmm30, %zmm6, %zmm11
+; AVX512BW-NEXT:    vinserti32x4 $2, 192(%r8), %zmm19, %zmm16
+; AVX512BW-NEXT:    vpermt2q %zmm14, %zmm6, %zmm16
+; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm6
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],mem[1],ymm6[3],mem[3]
+; AVX512BW-NEXT:    vmovdqu64 (%rsp), %zmm19 # 64-byte Reload
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm6, %zmm19, %zmm6
 ; AVX512BW-NEXT:    movb $16, %al
 ; AVX512BW-NEXT:    kmovd %eax, %k1
-; AVX512BW-NEXT:    vmovdqa64 %zmm23, %zmm11 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm7 = <10,u,2,3,4,5,11,u>
-; AVX512BW-NEXT:    vpermt2q %zmm23, %zmm7, %zmm0
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm17 = <14,u,2,3,4,5,15,u>
-; AVX512BW-NEXT:    vpermt2q %zmm23, %zmm17, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %ymm23
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm23 = ymm23[1],mem[1],ymm23[3],mem[3]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm23, %zmm29, %zmm23
-; AVX512BW-NEXT:    vmovdqu64 (%rsp), %zmm8 # 64-byte Reload
-; AVX512BW-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm29 # 64-byte Folded Reload
-; AVX512BW-NEXT:    # zmm29 = zmm8[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm25 {%k1}
-; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm7, %zmm23
-; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm17, %zmm29
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm27 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm19 = <10,u,2,3,4,5,11,u>
+; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm19, %zmm6
+; AVX512BW-NEXT:    vmovdqa 64(%rdi), %ymm0
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm24, %zmm0
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm9 {%k1}
+; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm19, %zmm0
 ; AVX512BW-NEXT:    vmovdqa 128(%rdi), %ymm1
 ; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3]
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm8, %zmm1
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512BW-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm30 # 64-byte Folded Reload
-; AVX512BW-NEXT:    # zmm30 = zmm8[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm9 {%k1}
-; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm7, %zmm1
-; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm17, %zmm30
-; AVX512BW-NEXT:    vmovdqa 192(%rdi), %ymm3
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm3, %zmm28, %zmm3
-; AVX512BW-NEXT:    vpermt2q %zmm22, %zmm7, %zmm3
-; AVX512BW-NEXT:    vmovdqa64 %zmm22, %zmm26 {%k1}
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm7 = zmm7[0,1,2,3],zmm12[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm28 = [0,10,2,3,4,5,6,11]
-; AVX512BW-NEXT:    vpermt2q %zmm20, %zmm28, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm22, %zmm17, %zmm7
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm17 = [0,1,2,3,4,12,6,7]
-; AVX512BW-NEXT:    vpermt2q %zmm20, %zmm17, %zmm11
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm22 = [0,14,2,3,4,5,6,15]
-; AVX512BW-NEXT:    vpermt2q %zmm20, %zmm22, %zmm2
-; AVX512BW-NEXT:    vpermt2q %zmm21, %zmm28, %zmm23
-; AVX512BW-NEXT:    vpermt2q %zmm21, %zmm17, %zmm25
-; AVX512BW-NEXT:    vpermt2q %zmm21, %zmm22, %zmm29
-; AVX512BW-NEXT:    vpermt2q %zmm19, %zmm28, %zmm1
-; AVX512BW-NEXT:    vpermt2q %zmm19, %zmm17, %zmm9
-; AVX512BW-NEXT:    vpermt2q %zmm19, %zmm22, %zmm30
-; AVX512BW-NEXT:    vpermt2q %zmm18, %zmm28, %zmm3
-; AVX512BW-NEXT:    vpermt2q %zmm18, %zmm17, %zmm26
-; AVX512BW-NEXT:    vpermt2q %zmm18, %zmm22, %zmm7
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm7, %zmm1
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm28 {%k1}
+; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm19, %zmm1
+; AVX512BW-NEXT:    vmovdqa 192(%rdi), %ymm2
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm21, %zmm2
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm26, %zmm24 {%k1}
+; AVX512BW-NEXT:    vpermt2q %zmm26, %zmm19, %zmm2
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm19 = [0,10,2,3,4,5,6,11]
+; AVX512BW-NEXT:    vpermt2q %zmm8, %zmm19, %zmm6
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm21 = [0,1,2,3,4,12,6,7]
+; AVX512BW-NEXT:    vpermt2q %zmm8, %zmm21, %zmm27
+; AVX512BW-NEXT:    vpermt2q %zmm17, %zmm19, %zmm0
+; AVX512BW-NEXT:    vpermt2q %zmm17, %zmm21, %zmm9
+; AVX512BW-NEXT:    vpermt2q %zmm30, %zmm19, %zmm1
+; AVX512BW-NEXT:    vpermt2q %zmm30, %zmm21, %zmm28
+; AVX512BW-NEXT:    vpermt2q %zmm14, %zmm19, %zmm2
+; AVX512BW-NEXT:    vpermt2q %zmm14, %zmm21, %zmm24
 ; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-NEXT:    vmovdqa64 %zmm7, 1472(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm13, 1408(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm26, 1344(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm3, 1280(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm10, 1216(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm30, 1088(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm16, 1024(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm9, 960(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm15, 1472(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm29, 1408(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm24, 1344(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, 1280(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm25, 1216(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, 1088(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm4, 1024(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm28, 960(%rax)
 ; AVX512BW-NEXT:    vmovdqa64 %zmm1, 896(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm15, 832(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm29, 704(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm14, 640(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm25, 576(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm23, 512(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm31, 448(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, 320(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm27, 256(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm11, 192(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, 128(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 64(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm4, 1152(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm5, 768(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm6, 384(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm24, (%rax)
-; AVX512BW-NEXT:    addq $712, %rsp # imm = 0x2C8
+; AVX512BW-NEXT:    vmovdqa64 %zmm13, 832(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm22, 704(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm31, 640(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm9, 576(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, 512(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm18, 448(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm12, 320(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm23, 256(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm27, 192(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm6, 128(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm20, 64(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm16, 1152(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm11, 768(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm10, 384(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm5, (%rax)
+; AVX512BW-NEXT:    addq $648, %rsp # imm = 0x288
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
   %in.vec0 = load <32 x i64>, ptr %in.vecptr0, align 64
@@ -6680,1355 +6672,1393 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ;
 ; AVX512F-LABEL: store_i64_stride6_vf64:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    subq $3400, %rsp # imm = 0xD48
-; AVX512F-NEXT:    vmovdqa64 (%rdx), %zmm13
-; AVX512F-NEXT:    vmovdqa64 64(%rdx), %zmm12
-; AVX512F-NEXT:    vmovdqa64 128(%rdx), %zmm11
-; AVX512F-NEXT:    vmovdqa64 192(%rdx), %zmm10
-; AVX512F-NEXT:    vmovdqa64 256(%rdx), %zmm9
-; AVX512F-NEXT:    vmovdqa64 320(%rdx), %zmm8
-; AVX512F-NEXT:    vmovdqa64 384(%rdx), %zmm7
-; AVX512F-NEXT:    vmovdqa64 448(%rdx), %zmm6
-; AVX512F-NEXT:    vmovdqa64 (%rcx), %zmm0
-; AVX512F-NEXT:    vmovdqa64 64(%rcx), %zmm1
-; AVX512F-NEXT:    vmovdqa64 128(%rcx), %zmm2
-; AVX512F-NEXT:    vmovdqa64 192(%rcx), %zmm30
-; AVX512F-NEXT:    vmovdqa64 256(%rcx), %zmm27
-; AVX512F-NEXT:    vmovdqa64 320(%rcx), %zmm24
-; AVX512F-NEXT:    vmovdqa64 384(%rcx), %zmm22
-; AVX512F-NEXT:    vmovdqa64 448(%rcx), %zmm21
-; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [4,12,4,12]
-; AVX512F-NEXT:    # ymm3 = mem[0,1,0,1]
-; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm4
-; AVX512F-NEXT:    vpermt2q %zmm21, %zmm3, %zmm4
-; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm7, %zmm4
-; AVX512F-NEXT:    vpermt2q %zmm22, %zmm3, %zmm4
-; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm4
-; AVX512F-NEXT:    vpermt2q %zmm24, %zmm3, %zmm4
-; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm4
-; AVX512F-NEXT:    vpermt2q %zmm27, %zmm3, %zmm4
-; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm10, %zmm4
-; AVX512F-NEXT:    vpermt2q %zmm30, %zmm3, %zmm4
-; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm11, %zmm4
-; AVX512F-NEXT:    vpermt2q %zmm2, %zmm3, %zmm4
-; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm12, %zmm4
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm3, %zmm4
-; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermi2q %zmm0, %zmm13, %zmm3
+; AVX512F-NEXT:    subq $3720, %rsp # imm = 0xE88
+; AVX512F-NEXT:    vmovdqa64 (%rdi), %zmm11
+; AVX512F-NEXT:    vmovdqa64 64(%rdi), %zmm10
+; AVX512F-NEXT:    vmovdqa64 128(%rdi), %zmm9
+; AVX512F-NEXT:    vmovdqa64 192(%rdi), %zmm8
+; AVX512F-NEXT:    vmovdqa64 256(%rdi), %zmm7
+; AVX512F-NEXT:    vmovdqa64 320(%rdi), %zmm6
+; AVX512F-NEXT:    vmovdqa64 384(%rdi), %zmm5
+; AVX512F-NEXT:    vmovdqa64 448(%rdi), %zmm4
+; AVX512F-NEXT:    vmovdqa64 (%rsi), %zmm30
+; AVX512F-NEXT:    vmovdqa64 64(%rsi), %zmm29
+; AVX512F-NEXT:    vmovdqa64 128(%rsi), %zmm28
+; AVX512F-NEXT:    vmovdqa64 192(%rsi), %zmm27
+; AVX512F-NEXT:    vmovdqa64 256(%rsi), %zmm1
+; AVX512F-NEXT:    vmovdqa64 320(%rsi), %zmm13
+; AVX512F-NEXT:    vmovdqa64 384(%rsi), %zmm24
+; AVX512F-NEXT:    vmovdqa64 448(%rsi), %zmm0
+; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [4,12,5,13,4,12,5,13]
+; AVX512F-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm22
+; AVX512F-NEXT:    vpermt2q %zmm0, %zmm2, %zmm22
+; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm3
+; AVX512F-NEXT:    vpermt2q %zmm24, %zmm2, %zmm3
 ; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm5 = [1,9,2,10,1,9,2,10]
-; AVX512F-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm3
-; AVX512F-NEXT:    vpermt2q %zmm0, %zmm5, %zmm3
+; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm3
+; AVX512F-NEXT:    vpermt2q %zmm13, %zmm2, %zmm3
 ; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm4
-; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm5 = [5,13,6,14,5,13,6,14]
-; AVX512F-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm3
-; AVX512F-NEXT:    vpermt2q %zmm0, %zmm5, %zmm3
+; AVX512F-NEXT:    vmovdqa64 %zmm7, %zmm3
+; AVX512F-NEXT:    vpermt2q %zmm1, %zmm2, %zmm3
 ; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm12, %zmm3
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm4, %zmm3
+; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm3
+; AVX512F-NEXT:    vpermt2q %zmm27, %zmm2, %zmm3
 ; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm14 = [3,11,3,11,3,11,3,11]
-; AVX512F-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm3
-; AVX512F-NEXT:    vpermt2q %zmm0, %zmm14, %zmm3
+; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm3
+; AVX512F-NEXT:    vpermt2q %zmm28, %zmm2, %zmm3
 ; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [7,15,7,15,7,15,7,15]
-; AVX512F-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vpermt2q %zmm0, %zmm3, %zmm13
-; AVX512F-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm12, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm5, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm12, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm14, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm3, %zmm12
-; AVX512F-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm11, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm2, %zmm4, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm11, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm2, %zmm5, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm11, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm2, %zmm14, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm2, %zmm3, %zmm11
+; AVX512F-NEXT:    vmovdqa64 %zmm10, %zmm3
+; AVX512F-NEXT:    vpermt2q %zmm29, %zmm2, %zmm3
+; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermi2q %zmm30, %zmm11, %zmm2
+; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10]
+; AVX512F-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vmovdqa64 %zmm11, %zmm3
+; AVX512F-NEXT:    vpermt2q %zmm30, %zmm2, %zmm3
+; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm12
+; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [6,14,6,14,6,14,6,14]
+; AVX512F-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vmovdqa64 %zmm11, %zmm3
+; AVX512F-NEXT:    vpermt2q %zmm30, %zmm2, %zmm3
+; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm15 = [7,15,7,15,7,15,7,15]
+; AVX512F-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vmovdqa64 %zmm11, %zmm3
+; AVX512F-NEXT:    vpermt2q %zmm30, %zmm15, %zmm3
+; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm10, %zmm3
+; AVX512F-NEXT:    vpermt2q %zmm29, %zmm12, %zmm3
+; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [0,8,1,9,0,8,1,9]
+; AVX512F-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vpermt2q %zmm30, %zmm3, %zmm11
 ; AVX512F-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm10, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm30, %zmm4, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm10, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm30, %zmm5, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm10, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm30, %zmm14, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm30, %zmm3, %zmm10
+; AVX512F-NEXT:    vmovdqa64 %zmm10, %zmm11
+; AVX512F-NEXT:    vpermt2q %zmm29, %zmm2, %zmm11
+; AVX512F-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm10, %zmm11
+; AVX512F-NEXT:    vpermt2q %zmm29, %zmm15, %zmm11
+; AVX512F-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm29, %zmm3, %zmm10
 ; AVX512F-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm27, %zmm4, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm27, %zmm5, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm27, %zmm14, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm27, %zmm3, %zmm9
+; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm10
+; AVX512F-NEXT:    vpermt2q %zmm28, %zmm12, %zmm10
+; AVX512F-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm10
+; AVX512F-NEXT:    vpermt2q %zmm28, %zmm2, %zmm10
+; AVX512F-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm10
+; AVX512F-NEXT:    vpermt2q %zmm28, %zmm15, %zmm10
+; AVX512F-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm28, %zmm3, %zmm9
 ; AVX512F-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm24, %zmm4, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm24, %zmm5, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm24, %zmm14, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm24, %zmm3, %zmm8
+; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm9
+; AVX512F-NEXT:    vpermt2q %zmm27, %zmm12, %zmm9
+; AVX512F-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm9
+; AVX512F-NEXT:    vpermt2q %zmm27, %zmm2, %zmm9
+; AVX512F-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm9
+; AVX512F-NEXT:    vpermt2q %zmm27, %zmm15, %zmm9
+; AVX512F-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm27, %zmm3, %zmm8
+; AVX512F-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm7, %zmm8
+; AVX512F-NEXT:    vpermt2q %zmm1, %zmm12, %zmm8
+; AVX512F-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm7, %zmm8
+; AVX512F-NEXT:    vpermt2q %zmm1, %zmm2, %zmm8
+; AVX512F-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm7, %zmm8
+; AVX512F-NEXT:    vpermt2q %zmm1, %zmm15, %zmm8
+; AVX512F-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm1, %zmm3, %zmm7
+; AVX512F-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm1
+; AVX512F-NEXT:    vpermt2q %zmm13, %zmm12, %zmm1
+; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm1
+; AVX512F-NEXT:    vpermt2q %zmm13, %zmm2, %zmm1
+; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm1
+; AVX512F-NEXT:    vpermt2q %zmm13, %zmm15, %zmm1
+; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm13, %zmm3, %zmm6
+; AVX512F-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm1
+; AVX512F-NEXT:    vpermt2q %zmm24, %zmm12, %zmm1
+; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm1
+; AVX512F-NEXT:    vpermt2q %zmm24, %zmm2, %zmm1
+; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm8
+; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm1
+; AVX512F-NEXT:    vpermt2q %zmm24, %zmm15, %zmm1
+; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm24, %zmm3, %zmm5
+; AVX512F-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 (%rdx), %zmm5
+; AVX512F-NEXT:    vmovdqa64 (%rcx), %zmm14
+; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm29
+; AVX512F-NEXT:    vmovdqa64 %zmm15, %zmm21
+; AVX512F-NEXT:    vpermt2q %zmm14, %zmm15, %zmm29
+; AVX512F-NEXT:    vmovdqa64 64(%rdx), %zmm1
+; AVX512F-NEXT:    vmovdqa64 64(%rcx), %zmm11
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm2
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm20
+; AVX512F-NEXT:    vpermt2q %zmm11, %zmm15, %zmm2
+; AVX512F-NEXT:    vmovdqu64 %zmm2, (%rsp) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 128(%rdx), %zmm23
+; AVX512F-NEXT:    vmovdqa64 128(%rcx), %zmm9
+; AVX512F-NEXT:    vmovdqa64 %zmm23, %zmm1
+; AVX512F-NEXT:    vpermt2q %zmm9, %zmm15, %zmm1
+; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 192(%rdx), %zmm18
+; AVX512F-NEXT:    vmovdqa64 192(%rcx), %zmm6
+; AVX512F-NEXT:    vmovdqa64 %zmm18, %zmm1
+; AVX512F-NEXT:    vpermt2q %zmm6, %zmm15, %zmm1
+; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 256(%rdx), %zmm17
+; AVX512F-NEXT:    vmovdqa64 256(%rcx), %zmm7
+; AVX512F-NEXT:    vmovdqa64 %zmm17, %zmm1
+; AVX512F-NEXT:    vpermt2q %zmm7, %zmm15, %zmm1
+; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 320(%rdx), %zmm15
+; AVX512F-NEXT:    vmovdqa64 320(%rcx), %zmm10
+; AVX512F-NEXT:    vmovdqa64 %zmm15, %zmm1
+; AVX512F-NEXT:    vpermt2q %zmm10, %zmm21, %zmm1
+; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 384(%rdx), %zmm13
+; AVX512F-NEXT:    vmovdqa64 384(%rcx), %zmm2
+; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm1
+; AVX512F-NEXT:    vpermt2q %zmm2, %zmm21, %zmm1
+; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm1
+; AVX512F-NEXT:    vpermi2q %zmm0, %zmm4, %zmm12
+; AVX512F-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermi2q %zmm0, %zmm4, %zmm8
+; AVX512F-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 448(%rdx), %zmm16
+; AVX512F-NEXT:    vmovdqa64 448(%rcx), %zmm4
+; AVX512F-NEXT:    vmovdqa64 %zmm16, %zmm8
+; AVX512F-NEXT:    vpermt2q %zmm4, %zmm21, %zmm8
 ; AVX512F-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm7, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm22, %zmm4, %zmm0
+; AVX512F-NEXT:    vpermi2q %zmm0, %zmm1, %zmm21
+; AVX512F-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm0, %zmm3, %zmm1
+; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [4,12,4,12]
+; AVX512F-NEXT:    # ymm1 = mem[0,1,0,1]
+; AVX512F-NEXT:    vmovdqa64 %zmm16, %zmm24
+; AVX512F-NEXT:    vpermt2q %zmm4, %zmm1, %zmm24
+; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm19
+; AVX512F-NEXT:    vpermt2q %zmm2, %zmm1, %zmm19
+; AVX512F-NEXT:    vmovdqa64 %zmm15, %zmm0
+; AVX512F-NEXT:    vpermt2q %zmm10, %zmm1, %zmm0
 ; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm7, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm22, %zmm5, %zmm0
+; AVX512F-NEXT:    vmovdqa64 %zmm17, %zmm31
+; AVX512F-NEXT:    vpermt2q %zmm7, %zmm1, %zmm31
+; AVX512F-NEXT:    vmovdqa64 %zmm18, %zmm21
+; AVX512F-NEXT:    vpermt2q %zmm6, %zmm1, %zmm21
+; AVX512F-NEXT:    vmovdqa64 %zmm23, %zmm12
+; AVX512F-NEXT:    vpermt2q %zmm9, %zmm1, %zmm12
+; AVX512F-NEXT:    vmovdqa64 %zmm20, %zmm3
+; AVX512F-NEXT:    vpermt2q %zmm11, %zmm1, %zmm3
+; AVX512F-NEXT:    vpermi2q %zmm14, %zmm5, %zmm1
+; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm27 = [1,9,2,10,1,9,2,10]
+; AVX512F-NEXT:    # zmm27 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm0
+; AVX512F-NEXT:    vpermt2q %zmm14, %zmm27, %zmm0
 ; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm7, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm22, %zmm14, %zmm0
+; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm28 = [5,13,6,14,5,13,6,14]
+; AVX512F-NEXT:    # zmm28 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm0
+; AVX512F-NEXT:    vpermt2q %zmm14, %zmm28, %zmm0
 ; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm22, %zmm3, %zmm7
-; AVX512F-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermi2q %zmm21, %zmm6, %zmm4
-; AVX512F-NEXT:    vmovdqu64 %zmm4, (%rsp) # 64-byte Spill
-; AVX512F-NEXT:    vpermi2q %zmm21, %zmm6, %zmm5
+; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11]
+; AVX512F-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vpermt2q %zmm14, %zmm0, %zmm5
+; AVX512F-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm20, %zmm5
+; AVX512F-NEXT:    vpermt2q %zmm11, %zmm27, %zmm5
 ; AVX512F-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermi2q %zmm21, %zmm6, %zmm14
+; AVX512F-NEXT:    vmovdqa64 %zmm20, %zmm5
+; AVX512F-NEXT:    vpermt2q %zmm11, %zmm28, %zmm5
+; AVX512F-NEXT:    vpermt2q %zmm11, %zmm0, %zmm20
+; AVX512F-NEXT:    vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm23, %zmm8
+; AVX512F-NEXT:    vpermt2q %zmm9, %zmm27, %zmm8
+; AVX512F-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm23, %zmm11
+; AVX512F-NEXT:    vpermt2q %zmm9, %zmm28, %zmm11
+; AVX512F-NEXT:    vmovdqa64 %zmm11, %zmm30
+; AVX512F-NEXT:    vpermt2q %zmm9, %zmm0, %zmm23
+; AVX512F-NEXT:    vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm18, %zmm9
+; AVX512F-NEXT:    vpermt2q %zmm6, %zmm27, %zmm9
+; AVX512F-NEXT:    vmovdqa64 %zmm18, %zmm8
+; AVX512F-NEXT:    vpermt2q %zmm6, %zmm28, %zmm8
+; AVX512F-NEXT:    vpermt2q %zmm6, %zmm0, %zmm18
+; AVX512F-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm17, %zmm6
+; AVX512F-NEXT:    vpermt2q %zmm7, %zmm27, %zmm6
+; AVX512F-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm17, %zmm6
+; AVX512F-NEXT:    vmovdqa64 %zmm17, %zmm14
+; AVX512F-NEXT:    vpermt2q %zmm7, %zmm28, %zmm6
+; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm11
+; AVX512F-NEXT:    vpermt2q %zmm7, %zmm0, %zmm14
 ; AVX512F-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm21, %zmm3, %zmm6
+; AVX512F-NEXT:    vmovdqa64 %zmm15, %zmm6
+; AVX512F-NEXT:    vmovdqa64 %zmm15, %zmm7
+; AVX512F-NEXT:    vpermt2q %zmm10, %zmm27, %zmm7
+; AVX512F-NEXT:    vmovdqa64 %zmm7, %zmm26
+; AVX512F-NEXT:    vmovdqa64 %zmm15, %zmm7
+; AVX512F-NEXT:    vpermt2q %zmm10, %zmm28, %zmm7
+; AVX512F-NEXT:    vmovdqa64 %zmm7, %zmm25
+; AVX512F-NEXT:    vpermt2q %zmm10, %zmm0, %zmm6
 ; AVX512F-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 448(%rdi), %zmm4
-; AVX512F-NEXT:    vmovdqa64 448(%rsi), %zmm11
-; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm28 = [4,12,5,13,4,12,5,13]
-; AVX512F-NEXT:    # zmm28 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm11, %zmm28, %zmm0
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm8
-; AVX512F-NEXT:    vmovdqa64 384(%rdi), %zmm2
-; AVX512F-NEXT:    vmovdqa64 384(%rsi), %zmm13
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm1
-; AVX512F-NEXT:    vpermt2q %zmm13, %zmm28, %zmm1
-; AVX512F-NEXT:    vmovdqa64 320(%rdi), %zmm3
-; AVX512F-NEXT:    vmovdqa64 320(%rsi), %zmm15
-; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm15, %zmm28, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 256(%rdi), %zmm31
-; AVX512F-NEXT:    vmovdqa64 256(%rsi), %zmm17
-; AVX512F-NEXT:    vmovdqa64 %zmm31, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm17, %zmm28, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 192(%rdi), %zmm29
-; AVX512F-NEXT:    vmovdqa64 192(%rsi), %zmm18
-; AVX512F-NEXT:    vmovdqa64 %zmm29, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm18, %zmm28, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 128(%rdi), %zmm25
-; AVX512F-NEXT:    vmovdqa64 128(%rsi), %zmm19
-; AVX512F-NEXT:    vmovdqa64 %zmm25, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm19, %zmm28, %zmm0
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm5
-; AVX512F-NEXT:    vmovdqa64 64(%rdi), %zmm6
-; AVX512F-NEXT:    vmovdqa64 64(%rsi), %zmm20
-; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm20, %zmm28, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 (%rdi), %zmm7
-; AVX512F-NEXT:    vmovdqa64 (%rsi), %zmm22
-; AVX512F-NEXT:    vpermi2q %zmm22, %zmm7, %zmm28
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm12 = [2,10,2,10,2,10,2,10]
-; AVX512F-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm7, %zmm14
-; AVX512F-NEXT:    vpermt2q %zmm22, %zmm12, %zmm14
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm10 = [6,14,6,14,6,14,6,14]
-; AVX512F-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm7, %zmm16
-; AVX512F-NEXT:    vpermt2q %zmm22, %zmm10, %zmm16
-; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm21 = [0,8,1,9,0,8,1,9]
-; AVX512F-NEXT:    # zmm21 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm7, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm22, %zmm21, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} ymm23 = [7,15,7,15]
-; AVX512F-NEXT:    # ymm23 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vpermt2q %zmm22, %zmm23, %zmm7
-; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm22
-; AVX512F-NEXT:    vpermt2q %zmm20, %zmm12, %zmm22
-; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm24
-; AVX512F-NEXT:    vpermt2q %zmm20, %zmm10, %zmm24
-; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm20, %zmm21, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm20, %zmm23, %zmm6
-; AVX512F-NEXT:    vmovdqa64 %zmm25, %zmm20
-; AVX512F-NEXT:    vpermt2q %zmm19, %zmm12, %zmm20
-; AVX512F-NEXT:    vmovdqa64 %zmm25, %zmm26
-; AVX512F-NEXT:    vpermt2q %zmm19, %zmm10, %zmm26
-; AVX512F-NEXT:    vmovdqa64 %zmm25, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm19, %zmm21, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm19, %zmm23, %zmm25
-; AVX512F-NEXT:    vmovdqa64 %zmm29, %zmm19
-; AVX512F-NEXT:    vpermt2q %zmm18, %zmm12, %zmm19
-; AVX512F-NEXT:    vmovdqa64 %zmm29, %zmm27
-; AVX512F-NEXT:    vpermt2q %zmm18, %zmm10, %zmm27
-; AVX512F-NEXT:    vmovdqa64 %zmm29, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm18, %zmm21, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm18, %zmm23, %zmm29
-; AVX512F-NEXT:    vmovdqa64 %zmm31, %zmm18
-; AVX512F-NEXT:    vpermt2q %zmm17, %zmm12, %zmm18
-; AVX512F-NEXT:    vmovdqa64 %zmm31, %zmm9
-; AVX512F-NEXT:    vpermt2q %zmm17, %zmm10, %zmm9
-; AVX512F-NEXT:    vmovdqa64 %zmm31, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm17, %zmm21, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm17, %zmm23, %zmm31
-; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm17
-; AVX512F-NEXT:    vpermt2q %zmm15, %zmm12, %zmm17
-; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm30
-; AVX512F-NEXT:    vpermt2q %zmm15, %zmm10, %zmm30
-; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm15, %zmm21, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm15, %zmm23, %zmm3
-; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm15
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm3
-; AVX512F-NEXT:    vpermt2q %zmm13, %zmm12, %zmm3
-; AVX512F-NEXT:    vpermt2q %zmm13, %zmm10, %zmm2
-; AVX512F-NEXT:    vmovdqa64 %zmm15, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm13, %zmm21, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm13, %zmm23, %zmm15
-; AVX512F-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermi2q %zmm11, %zmm4, %zmm12
-; AVX512F-NEXT:    vpermi2q %zmm11, %zmm4, %zmm10
-; AVX512F-NEXT:    vpermi2q %zmm11, %zmm4, %zmm21
-; AVX512F-NEXT:    vpermt2q %zmm11, %zmm23, %zmm4
-; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm6
+; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm7
+; AVX512F-NEXT:    vpermt2q %zmm2, %zmm27, %zmm7
+; AVX512F-NEXT:    vmovdqa64 %zmm7, %zmm10
+; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm7
+; AVX512F-NEXT:    vpermt2q %zmm2, %zmm28, %zmm7
+; AVX512F-NEXT:    vmovdqa64 %zmm7, %zmm23
+; AVX512F-NEXT:    vpermt2q %zmm2, %zmm0, %zmm6
+; AVX512F-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermi2q %zmm4, %zmm16, %zmm27
+; AVX512F-NEXT:    vpermi2q %zmm4, %zmm16, %zmm28
+; AVX512F-NEXT:    vpermt2q %zmm4, %zmm0, %zmm16
+; AVX512F-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-NEXT:    movb $12, %al
 ; AVX512F-NEXT:    kmovw %eax, %k1
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm11, %zmm8 {%k1}
-; AVX512F-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm11, %zmm1 {%k1}
-; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm11, %zmm8 {%k1}
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm24, %zmm22 {%k1}
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm19, %zmm18 {%k1}
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm11, %zmm0 {%k1}
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm11, %zmm4 {%k1}
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm11, %zmm5 {%k1}
-; AVX512F-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm11, %zmm1 {%k1}
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm11, %zmm28 {%k1}
+; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm19 {%k1}
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm31, %zmm17 {%k1}
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm21, %zmm16 {%k1}
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm12, %zmm21 {%k1}
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm31 {%k1}
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm20 {%k1}
 ; AVX512F-NEXT:    movb $48, %al
 ; AVX512F-NEXT:    kmovw %eax, %k2
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm14, %zmm11 {%k2}
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm16, %zmm23 {%k2}
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm22, %zmm13 {%k2}
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm24, %zmm22 {%k2}
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm20, %zmm14 {%k2}
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm26, %zmm20 {%k2}
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm19, %zmm16 {%k2}
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm27, %zmm19 {%k2}
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm18, %zmm24 {%k2}
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm18 {%k2}
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm17, %zmm26 {%k2}
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm30, %zmm17 {%k2}
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm27 {%k2}
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm0 {%k2}
-; AVX512F-NEXT:    vmovdqu64 (%rsp), %zmm15 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm12, %zmm15 {%k2}
-; AVX512F-NEXT:    vmovdqa64 (%r8), %zmm5
-; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm2 = <0,1,9,u,4,5,6,7>
-; AVX512F-NEXT:    vpermt2q %zmm5, %zmm2, %zmm11
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k2}
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm3 {%k2}
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vshufi64x2 {{.*#+}} zmm14 = zmm0[0,1,2,3],zmm29[4,5,6,7]
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm4 {%k2}
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm5 {%k2}
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vshufi64x2 $228, (%rsp), %zmm0, %zmm0 # 64-byte Folded Reload
+; AVX512F-NEXT:    # zmm0 = zmm0[0,1,2,3],mem[4,5,6,7]
+; AVX512F-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm6 {%k2}
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm30 {%k2}
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm15 # 64-byte Folded Reload
+; AVX512F-NEXT:    # zmm15 = zmm0[0,1,2,3],mem[4,5,6,7]
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm9 {%k2}
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm8 {%k2}
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm13 # 64-byte Folded Reload
+; AVX512F-NEXT:    # zmm13 = zmm0[0,1,2,3],mem[4,5,6,7]
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm7 {%k2}
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm11 {%k2}
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm12 # 64-byte Folded Reload
+; AVX512F-NEXT:    # zmm12 = zmm0[0,1,2,3],mem[4,5,6,7]
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm26 {%k2}
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm25 {%k2}
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
+; AVX512F-NEXT:    # zmm0 = zmm0[0,1,2,3],mem[4,5,6,7]
+; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm10 {%k2}
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm23 {%k2}
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
+; AVX512F-NEXT:    # zmm0 = zmm0[0,1,2,3],mem[4,5,6,7]
+; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm27 {%k2}
+; AVX512F-NEXT:    vmovdqa64 (%r8), %zmm29
+; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm1 = <0,1,9,u,4,5,6,7>
+; AVX512F-NEXT:    vpermt2q %zmm29, %zmm1, %zmm2
+; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 64(%r8), %zmm2
+; AVX512F-NEXT:    vpermt2q %zmm2, %zmm1, %zmm4
+; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 128(%r8), %zmm0
+; AVX512F-NEXT:    vpermt2q %zmm0, %zmm1, %zmm6
+; AVX512F-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 192(%r8), %zmm4
+; AVX512F-NEXT:    vpermt2q %zmm4, %zmm1, %zmm9
+; AVX512F-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 256(%r8), %zmm6
+; AVX512F-NEXT:    vpermt2q %zmm6, %zmm1, %zmm7
+; AVX512F-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 320(%r8), %zmm7
+; AVX512F-NEXT:    vpermt2q %zmm7, %zmm1, %zmm26
+; AVX512F-NEXT:    vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 384(%r8), %zmm9
+; AVX512F-NEXT:    vpermt2q %zmm9, %zmm1, %zmm10
+; AVX512F-NEXT:    vmovdqa64 %zmm10, %zmm24
+; AVX512F-NEXT:    vmovdqa64 448(%r8), %zmm10
+; AVX512F-NEXT:    vpermt2q %zmm10, %zmm1, %zmm27
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm28 {%k2}
+; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm1 = <0,1,13,u,4,5,6,7>
+; AVX512F-NEXT:    vpermt2q %zmm29, %zmm1, %zmm3
+; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm2, %zmm1, %zmm5
+; AVX512F-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm0, %zmm1, %zmm30
+; AVX512F-NEXT:    vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm4, %zmm1, %zmm8
+; AVX512F-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm6, %zmm1, %zmm11
 ; AVX512F-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 64(%r8), %zmm3
-; AVX512F-NEXT:    vpermt2q %zmm3, %zmm2, %zmm13
-; AVX512F-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 128(%r8), %zmm9
-; AVX512F-NEXT:    vpermt2q %zmm9, %zmm2, %zmm14
+; AVX512F-NEXT:    vpermt2q %zmm7, %zmm1, %zmm25
+; AVX512F-NEXT:    vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm9, %zmm1, %zmm23
+; AVX512F-NEXT:    vpermt2q %zmm10, %zmm1, %zmm28
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm3 # 64-byte Folded Reload
+; AVX512F-NEXT:    # zmm3 = zmm1[0,1,2,3],mem[4,5,6,7]
+; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm1 = <14,u,2,3,4,5,15,u>
+; AVX512F-NEXT:    vpermt2q %zmm29, %zmm1, %zmm14
 ; AVX512F-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 192(%r8), %zmm11
-; AVX512F-NEXT:    vpermt2q %zmm11, %zmm2, %zmm16
-; AVX512F-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 256(%r8), %zmm13
-; AVX512F-NEXT:    vpermt2q %zmm13, %zmm2, %zmm24
-; AVX512F-NEXT:    vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 320(%r8), %zmm14
-; AVX512F-NEXT:    vpermt2q %zmm14, %zmm2, %zmm26
-; AVX512F-NEXT:    vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 384(%r8), %zmm16
-; AVX512F-NEXT:    vpermt2q %zmm16, %zmm2, %zmm27
-; AVX512F-NEXT:    vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 448(%r8), %zmm12
-; AVX512F-NEXT:    vpermt2q %zmm12, %zmm2, %zmm15
-; AVX512F-NEXT:    vmovdqu64 %zmm15, (%rsp) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqu64 (%rsp), %zmm14 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm2, %zmm1, %zmm14
+; AVX512F-NEXT:    vpermt2q %zmm0, %zmm1, %zmm15
+; AVX512F-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm4, %zmm1, %zmm13
+; AVX512F-NEXT:    vpermt2q %zmm6, %zmm1, %zmm12
+; AVX512F-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm7, %zmm1, %zmm12
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm10, %zmm15 {%k2}
-; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm2 = <0,1,13,u,4,5,6,7>
-; AVX512F-NEXT:    vpermt2q %zmm5, %zmm2, %zmm23
-; AVX512F-NEXT:    vpermt2q %zmm3, %zmm2, %zmm22
-; AVX512F-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm9, %zmm2, %zmm20
-; AVX512F-NEXT:    vmovdqa64 %zmm20, %zmm24
-; AVX512F-NEXT:    vpermt2q %zmm11, %zmm2, %zmm19
-; AVX512F-NEXT:    vpermt2q %zmm13, %zmm2, %zmm18
-; AVX512F-NEXT:    vmovdqa64 %zmm18, %zmm26
-; AVX512F-NEXT:    vpermt2q %zmm14, %zmm2, %zmm17
+; AVX512F-NEXT:    vpermt2q %zmm9, %zmm1, %zmm15
+; AVX512F-NEXT:    vpermt2q %zmm10, %zmm1, %zmm3
+; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa (%rdi), %ymm1
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3]
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm3, %zmm3
+; AVX512F-NEXT:    vmovdqa 64(%rdi), %ymm1
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3]
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm5, %zmm5
+; AVX512F-NEXT:    movb $16, %al
+; AVX512F-NEXT:    kmovw %eax, %k2
+; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm31 {%k2}
+; AVX512F-NEXT:    vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm1 = <10,u,2,3,4,5,11,u>
+; AVX512F-NEXT:    vpermt2q %zmm2, %zmm1, %zmm5
+; AVX512F-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa 128(%rdi), %ymm2
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3]
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm5, %zmm31
+; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm21 {%k2}
+; AVX512F-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm0, %zmm1, %zmm31
+; AVX512F-NEXT:    vmovdqa 192(%rdi), %ymm2
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3]
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm0, %zmm30
+; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm16 {%k2}
+; AVX512F-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm4, %zmm1, %zmm30
+; AVX512F-NEXT:    vmovdqa 256(%rdi), %ymm2
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3]
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm0, %zmm26
+; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm17 {%k2}
 ; AVX512F-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm16, %zmm2, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm12, %zmm2, %zmm15
-; AVX512F-NEXT:    vmovdqa (%rdi), %ymm2
+; AVX512F-NEXT:    vpermt2q %zmm6, %zmm1, %zmm26
+; AVX512F-NEXT:    vmovdqa 320(%rdi), %ymm2
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3]
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm0, %zmm25
+; AVX512F-NEXT:    vmovdqa64 %zmm7, %zmm19 {%k2}
+; AVX512F-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm7, %zmm1, %zmm25
+; AVX512F-NEXT:    vmovdqa 384(%rdi), %ymm2
 ; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3]
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm0, %zmm10
-; AVX512F-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm7 # 64-byte Folded Reload
-; AVX512F-NEXT:    # zmm7 = zmm7[0,1,2,3],mem[4,5,6,7]
-; AVX512F-NEXT:    vmovdqa 64(%rdi), %ymm2
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm0, %zmm19
+; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm18 {%k2}
+; AVX512F-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm9, %zmm1, %zmm19
+; AVX512F-NEXT:    vmovdqa 448(%rdi), %ymm2
 ; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3]
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm0, %zmm0
-; AVX512F-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm6 # 64-byte Folded Reload
-; AVX512F-NEXT:    # zmm6 = zmm6[0,1,2,3],mem[4,5,6,7]
-; AVX512F-NEXT:    movb $16, %al
-; AVX512F-NEXT:    kmovw %eax, %k2
-; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm1 {%k2}
-; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm1 = <10,u,2,3,4,5,11,u>
-; AVX512F-NEXT:    vpermt2q %zmm3, %zmm1, %zmm0
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm0, %zmm16
+; AVX512F-NEXT:    vmovdqa64 %zmm10, %zmm22 {%k2}
+; AVX512F-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm10, %zmm1, %zmm16
+; AVX512F-NEXT:    vpermt2q %zmm29, %zmm1, %zmm3
+; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm29, %zmm20 {%k2}
+; AVX512F-NEXT:    vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 (%r9), %zmm29
+; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm6 = [0,1,2,9,4,5,6,7]
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm29, %zmm6, %zmm0
 ; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm2 = <14,u,2,3,4,5,15,u>
-; AVX512F-NEXT:    vpermt2q %zmm3, %zmm2, %zmm6
-; AVX512F-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa 128(%rdi), %ymm3
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3]
+; AVX512F-NEXT:    vmovdqa64 64(%r9), %zmm3
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm3, %zmm0, %zmm3
-; AVX512F-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm6 # 64-byte Folded Reload
-; AVX512F-NEXT:    # zmm6 = zmm25[0,1,2,3],mem[4,5,6,7]
+; AVX512F-NEXT:    vpermt2q %zmm3, %zmm6, %zmm0
+; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 128(%r9), %zmm4
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm0 {%k2}
+; AVX512F-NEXT:    vpermt2q %zmm4, %zmm6, %zmm0
 ; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm9, %zmm1, %zmm3
-; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm9, %zmm2, %zmm6
-; AVX512F-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa 192(%rdi), %ymm3
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3]
+; AVX512F-NEXT:    vmovdqa64 192(%r9), %zmm5
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm3, %zmm0, %zmm0
-; AVX512F-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm3 # 64-byte Folded Reload
-; AVX512F-NEXT:    # zmm3 = zmm29[0,1,2,3],mem[4,5,6,7]
-; AVX512F-NEXT:    vmovdqa64 %zmm11, %zmm4 {%k2}
-; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm11, %zmm1, %zmm0
+; AVX512F-NEXT:    vpermt2q %zmm5, %zmm6, %zmm0
 ; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm11, %zmm2, %zmm3
-; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa 256(%rdi), %ymm0
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm3, %zmm3
-; AVX512F-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm31, %zmm4 # 64-byte Folded Reload
-; AVX512F-NEXT:    # zmm4 = zmm31[0,1,2,3],mem[4,5,6,7]
+; AVX512F-NEXT:    vmovdqa64 256(%r9), %zmm11
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm0 {%k2}
+; AVX512F-NEXT:    vpermt2q %zmm11, %zmm6, %zmm0
 ; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm13, %zmm1, %zmm3
-; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm13, %zmm2, %zmm4
-; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa 320(%rdi), %ymm0
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm3, %zmm27
+; AVX512F-NEXT:    vmovdqa64 320(%r9), %zmm8
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm25 # 64-byte Folded Reload
-; AVX512F-NEXT:    # zmm25 = zmm0[0,1,2,3],mem[4,5,6,7]
-; AVX512F-NEXT:    vmovdqa64 %zmm14, %zmm8 {%k2}
-; AVX512F-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm14, %zmm1, %zmm27
-; AVX512F-NEXT:    vpermt2q %zmm14, %zmm2, %zmm25
-; AVX512F-NEXT:    vmovdqa 384(%rdi), %ymm0
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm3, %zmm22
+; AVX512F-NEXT:    vpermt2q %zmm8, %zmm6, %zmm0
+; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 384(%r9), %zmm7
+; AVX512F-NEXT:    vpermt2q %zmm7, %zmm6, %zmm24
+; AVX512F-NEXT:    vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 448(%r9), %zmm22
+; AVX512F-NEXT:    vpermt2q %zmm22, %zmm6, %zmm27
+; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm6 = [0,1,2,13,4,5,6,7]
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm20 # 64-byte Folded Reload
-; AVX512F-NEXT:    # zmm20 = zmm0[0,1,2,3],mem[4,5,6,7]
+; AVX512F-NEXT:    vpermt2q %zmm29, %zmm6, %zmm0
+; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm16, %zmm0 {%k2}
+; AVX512F-NEXT:    vpermt2q %zmm3, %zmm6, %zmm0
 ; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm16, %zmm1, %zmm22
-; AVX512F-NEXT:    vpermt2q %zmm16, %zmm2, %zmm20
-; AVX512F-NEXT:    vmovdqa 448(%rdi), %ymm0
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm3, %zmm18
-; AVX512F-NEXT:    vpermt2q %zmm5, %zmm1, %zmm10
-; AVX512F-NEXT:    vmovdqa64 %zmm10, %zmm11
-; AVX512F-NEXT:    vpermt2q %zmm12, %zmm1, %zmm18
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm17 # 64-byte Folded Reload
-; AVX512F-NEXT:    # zmm17 = zmm0[0,1,2,3],mem[4,5,6,7]
+; AVX512F-NEXT:    vpermt2q %zmm4, %zmm6, %zmm0
+; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm12, %zmm0 {%k2}
+; AVX512F-NEXT:    vpermt2q %zmm5, %zmm6, %zmm0
 ; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm12, %zmm2, %zmm17
-; AVX512F-NEXT:    vpermt2q %zmm5, %zmm2, %zmm7
-; AVX512F-NEXT:    vmovdqa64 %zmm7, %zmm13
-; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm28 {%k2}
-; AVX512F-NEXT:    vmovdqa64 (%r9), %zmm6
-; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm10 = [0,1,2,9,4,5,6,7]
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm6, %zmm10, %zmm1
-; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 64(%r9), %zmm7
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm7, %zmm10, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 128(%r9), %zmm2
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm2, %zmm10, %zmm3
-; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 192(%r9), %zmm3
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm3, %zmm10, %zmm4
-; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 256(%r9), %zmm4
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm4, %zmm10, %zmm5
-; AVX512F-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 320(%r9), %zmm5
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm5, %zmm10, %zmm0
+; AVX512F-NEXT:    vpermt2q %zmm11, %zmm6, %zmm0
 ; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 384(%r9), %zmm8
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm8, %zmm10, %zmm0
+; AVX512F-NEXT:    vpermt2q %zmm8, %zmm6, %zmm0
 ; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 448(%r9), %zmm9
-; AVX512F-NEXT:    vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm9, %zmm10, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm10 = [0,1,2,13,4,5,6,7]
-; AVX512F-NEXT:    vpermt2q %zmm6, %zmm10, %zmm23
+; AVX512F-NEXT:    vpermt2q %zmm7, %zmm6, %zmm23
 ; AVX512F-NEXT:    vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm7, %zmm10, %zmm12
-; AVX512F-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm2, %zmm10, %zmm24
-; AVX512F-NEXT:    vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm3, %zmm10, %zmm19
-; AVX512F-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm4, %zmm10, %zmm26
-; AVX512F-NEXT:    vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm22, %zmm6, %zmm28
+; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm6 = [0,14,2,3,4,5,6,15]
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm29, %zmm6, %zmm0
+; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm3, %zmm6, %zmm14
+; AVX512F-NEXT:    vmovdqu64 %zmm14, (%rsp) # 64-byte Spill
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm5, %zmm10, %zmm0
+; AVX512F-NEXT:    vpermt2q %zmm4, %zmm6, %zmm0
 ; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm5, %zmm6, %zmm13
+; AVX512F-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm8, %zmm10, %zmm0
+; AVX512F-NEXT:    vpermt2q %zmm11, %zmm6, %zmm0
 ; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm9, %zmm10, %zmm15
+; AVX512F-NEXT:    vpermt2q %zmm8, %zmm6, %zmm12
+; AVX512F-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm7, %zmm6, %zmm15
 ; AVX512F-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa (%rdx), %xmm10
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0]
-; AVX512F-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm10, %zmm0, %zmm12 {%k1}
-; AVX512F-NEXT:    vmovdqa 64(%rdx), %xmm10
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0]
-; AVX512F-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm10, %zmm0, %zmm14 {%k1}
-; AVX512F-NEXT:    vmovdqa 128(%rdx), %xmm10
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0]
-; AVX512F-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm10, %zmm0, %zmm16 {%k1}
-; AVX512F-NEXT:    vmovdqa 192(%rdx), %xmm10
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0]
-; AVX512F-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm10, %zmm0, %zmm30 {%k1}
-; AVX512F-NEXT:    vmovdqa 256(%rdx), %xmm10
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0]
-; AVX512F-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm10, %zmm0, %zmm31 {%k1}
-; AVX512F-NEXT:    vmovdqa 320(%rdx), %xmm10
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0]
-; AVX512F-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm10, %zmm0, %zmm1 {%k1}
-; AVX512F-NEXT:    vmovdqa 384(%rdx), %xmm10
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0]
-; AVX512F-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm10, %zmm0, %zmm0 {%k1}
-; AVX512F-NEXT:    vmovdqa 448(%rdx), %xmm10
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0]
-; AVX512F-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm10, %zmm0, %zmm21 {%k1}
-; AVX512F-NEXT:    vinserti32x4 $2, (%r8), %zmm12, %zmm10
-; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm29 = [0,1,2,3,4,8,6,7]
-; AVX512F-NEXT:    vpermt2q %zmm6, %zmm29, %zmm10
-; AVX512F-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vinserti32x4 $2, 64(%r8), %zmm14, %zmm10
-; AVX512F-NEXT:    vpermt2q %zmm7, %zmm29, %zmm10
-; AVX512F-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vinserti32x4 $2, 128(%r8), %zmm16, %zmm10
-; AVX512F-NEXT:    vpermt2q %zmm2, %zmm29, %zmm10
-; AVX512F-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vinserti32x4 $2, 192(%r8), %zmm30, %zmm26
-; AVX512F-NEXT:    vpermt2q %zmm3, %zmm29, %zmm26
-; AVX512F-NEXT:    vinserti32x4 $2, 256(%r8), %zmm31, %zmm24
-; AVX512F-NEXT:    vpermt2q %zmm4, %zmm29, %zmm24
-; AVX512F-NEXT:    vinserti32x4 $2, 320(%r8), %zmm1, %zmm23
-; AVX512F-NEXT:    vpermt2q %zmm5, %zmm29, %zmm23
-; AVX512F-NEXT:    vinserti32x4 $2, 384(%r8), %zmm0, %zmm19
-; AVX512F-NEXT:    vpermt2q %zmm8, %zmm29, %zmm19
-; AVX512F-NEXT:    vinserti32x4 $2, 448(%r8), %zmm21, %zmm21
-; AVX512F-NEXT:    vpermt2q %zmm9, %zmm29, %zmm21
-; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm29 = [0,10,2,3,4,5,6,11]
-; AVX512F-NEXT:    vpermt2q %zmm6, %zmm29, %zmm11
-; AVX512F-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm30 = [0,1,2,3,4,12,6,7]
-; AVX512F-NEXT:    vpermt2q %zmm6, %zmm30, %zmm28
-; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm31 = [0,14,2,3,4,5,6,15]
-; AVX512F-NEXT:    vpermt2q %zmm6, %zmm31, %zmm13
-; AVX512F-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm7, %zmm29, %zmm16
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm7, %zmm30, %zmm15
+; AVX512F-NEXT:    vpermt2q %zmm22, %zmm6, %zmm0
+; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa (%rdx), %xmm6
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0]
+; AVX512F-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm6, %zmm0, %zmm0 {%k1}
+; AVX512F-NEXT:    vmovdqa 64(%rdx), %xmm6
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0]
+; AVX512F-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm7, %zmm31, %zmm14
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm2, %zmm29, %zmm13
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm2, %zmm30, %zmm12
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm2, %zmm31, %zmm11
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm6, %zmm0, %zmm14 {%k1}
+; AVX512F-NEXT:    vmovdqa 128(%rdx), %xmm6
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0]
+; AVX512F-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm6, %zmm0, %zmm15 {%k1}
+; AVX512F-NEXT:    vmovdqa 192(%rdx), %xmm6
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0]
+; AVX512F-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm3, %zmm29, %zmm10
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm3, %zmm30, %zmm7
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm6, %zmm0, %zmm10 {%k1}
+; AVX512F-NEXT:    vmovdqa 256(%rdx), %xmm6
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0]
+; AVX512F-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm6, %zmm0, %zmm12 {%k1}
+; AVX512F-NEXT:    vmovdqa 320(%rdx), %xmm6
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0]
+; AVX512F-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm3, %zmm31, %zmm2
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm6, %zmm0, %zmm2 {%k1}
+; AVX512F-NEXT:    vmovdqa 384(%rdx), %xmm6
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0]
+; AVX512F-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm6, %zmm0, %zmm9 {%k1}
+; AVX512F-NEXT:    vmovdqa 448(%rdx), %xmm6
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0]
+; AVX512F-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm4, %zmm29, %zmm1
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm6, %zmm0, %zmm1 {%k1}
+; AVX512F-NEXT:    vinserti32x4 $2, (%r8), %zmm0, %zmm24
+; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,8,6,7]
+; AVX512F-NEXT:    vpermt2q %zmm29, %zmm0, %zmm24
+; AVX512F-NEXT:    vinserti32x4 $2, 64(%r8), %zmm14, %zmm23
+; AVX512F-NEXT:    vpermt2q %zmm3, %zmm0, %zmm23
+; AVX512F-NEXT:    vinserti32x4 $2, 128(%r8), %zmm15, %zmm21
+; AVX512F-NEXT:    vpermt2q %zmm4, %zmm0, %zmm21
+; AVX512F-NEXT:    vinserti32x4 $2, 192(%r8), %zmm10, %zmm20
+; AVX512F-NEXT:    vpermt2q %zmm5, %zmm0, %zmm20
+; AVX512F-NEXT:    vinserti32x4 $2, 256(%r8), %zmm12, %zmm18
+; AVX512F-NEXT:    vpermt2q %zmm11, %zmm0, %zmm18
+; AVX512F-NEXT:    vinserti32x4 $2, 320(%r8), %zmm2, %zmm17
+; AVX512F-NEXT:    vpermt2q %zmm8, %zmm0, %zmm17
+; AVX512F-NEXT:    vinserti32x4 $2, 384(%r8), %zmm9, %zmm15
+; AVX512F-NEXT:    vpermt2q %zmm7, %zmm0, %zmm15
+; AVX512F-NEXT:    vinserti32x4 $2, 448(%r8), %zmm1, %zmm14
+; AVX512F-NEXT:    vpermt2q %zmm22, %zmm0, %zmm14
+; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [0,10,2,3,4,5,6,11]
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm29, %zmm0, %zmm13
+; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,4,12,6,7]
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm29, %zmm1, %zmm12
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm3, %zmm0, %zmm10
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm3, %zmm1, %zmm9
+; AVX512F-NEXT:    vpermt2q %zmm4, %zmm0, %zmm31
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm4, %zmm1, %zmm6
+; AVX512F-NEXT:    vpermt2q %zmm5, %zmm0, %zmm30
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm5, %zmm1, %zmm29
+; AVX512F-NEXT:    vpermt2q %zmm11, %zmm0, %zmm26
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm11, %zmm1, %zmm4
+; AVX512F-NEXT:    vpermt2q %zmm8, %zmm0, %zmm25
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm4, %zmm30, %zmm3
+; AVX512F-NEXT:    vpermt2q %zmm8, %zmm1, %zmm3
+; AVX512F-NEXT:    vpermt2q %zmm7, %zmm0, %zmm19
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm7, %zmm1, %zmm2
+; AVX512F-NEXT:    vpermt2q %zmm22, %zmm0, %zmm16
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm4, %zmm31, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm5, %zmm29, %zmm27
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm5, %zmm30, %zmm4
-; AVX512F-NEXT:    vpermt2q %zmm5, %zmm31, %zmm25
-; AVX512F-NEXT:    vpermt2q %zmm8, %zmm29, %zmm22
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm8, %zmm30, %zmm5
-; AVX512F-NEXT:    vpermt2q %zmm8, %zmm31, %zmm20
-; AVX512F-NEXT:    vpermt2q %zmm9, %zmm29, %zmm18
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm9, %zmm30, %zmm6
-; AVX512F-NEXT:    vpermt2q %zmm9, %zmm31, %zmm17
+; AVX512F-NEXT:    vpermt2q %zmm22, %zmm1, %zmm0
 ; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512F-NEXT:    vmovdqa64 %zmm17, 3008(%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm8, 2944(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm6, 2880(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm18, 2816(%rax)
-; AVX512F-NEXT:    vmovups (%rsp), %zmm6 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm6, 2752(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm20, 2624(%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm6, 2560(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm5, 2496(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm22, 2432(%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm5, 2368(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm25, 2240(%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm5, 2176(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm4, 2112(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm27, 2048(%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm4, 1984(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm0, 1856(%rax)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm1, 3008(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm28, 2944(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm0, 2880(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm16, 2816(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm27, 2752(%rax)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm0, 2624(%rax)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm0, 2560(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm2, 2496(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm19, 2432(%rax)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm0, 2368(%rax)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm0, 2240(%rax)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm0, 2176(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm3, 2112(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm25, 2048(%rax)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm0, 1984(%rax)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm0, 1856(%rax)
 ; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512F-NEXT:    vmovaps %zmm0, 1792(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm3, 1728(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm1, 1664(%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm3, 1600(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm2, 1472(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm4, 1728(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm26, 1664(%rax)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm0, 1600(%rax)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm0, 1472(%rax)
 ; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512F-NEXT:    vmovaps %zmm0, 1408(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm7, 1344(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm10, 1280(%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm2, 1216(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm11, 1088(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm29, 1344(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm30, 1280(%rax)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm0, 1216(%rax)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm0, 1088(%rax)
 ; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512F-NEXT:    vmovaps %zmm0, 1024(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm12, 960(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm13, 896(%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm1, 832(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm14, 704(%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm1, 640(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm15, 576(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm16, 512(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm6, 960(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm31, 896(%rax)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm0, 832(%rax)
+; AVX512F-NEXT:    vmovups (%rsp), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm0, 704(%rax)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm0, 640(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm9, 576(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm10, 512(%rax)
 ; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512F-NEXT:    vmovaps %zmm0, 448(%rax)
 ; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512F-NEXT:    vmovaps %zmm0, 320(%rax)
 ; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512F-NEXT:    vmovaps %zmm0, 256(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm28, 192(%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 128(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm12, 192(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm13, 128(%rax)
 ; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512F-NEXT:    vmovaps %zmm0, 64(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm21, 2688(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm19, 2304(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm23, 1920(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm24, 1536(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm26, 1152(%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 768(%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 384(%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, (%rax)
-; AVX512F-NEXT:    addq $3400, %rsp # imm = 0xD48
+; AVX512F-NEXT:    vmovdqa64 %zmm14, 2688(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm15, 2304(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm17, 1920(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm18, 1536(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm20, 1152(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm21, 768(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm23, 384(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm24, (%rax)
+; AVX512F-NEXT:    addq $3720, %rsp # imm = 0xE88
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512BW-LABEL: store_i64_stride6_vf64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    subq $3400, %rsp # imm = 0xD48
-; AVX512BW-NEXT:    vmovdqa64 (%rdx), %zmm13
-; AVX512BW-NEXT:    vmovdqa64 64(%rdx), %zmm12
-; AVX512BW-NEXT:    vmovdqa64 128(%rdx), %zmm11
-; AVX512BW-NEXT:    vmovdqa64 192(%rdx), %zmm10
-; AVX512BW-NEXT:    vmovdqa64 256(%rdx), %zmm9
-; AVX512BW-NEXT:    vmovdqa64 320(%rdx), %zmm8
-; AVX512BW-NEXT:    vmovdqa64 384(%rdx), %zmm7
-; AVX512BW-NEXT:    vmovdqa64 448(%rdx), %zmm6
-; AVX512BW-NEXT:    vmovdqa64 (%rcx), %zmm0
-; AVX512BW-NEXT:    vmovdqa64 64(%rcx), %zmm1
-; AVX512BW-NEXT:    vmovdqa64 128(%rcx), %zmm2
-; AVX512BW-NEXT:    vmovdqa64 192(%rcx), %zmm30
-; AVX512BW-NEXT:    vmovdqa64 256(%rcx), %zmm27
-; AVX512BW-NEXT:    vmovdqa64 320(%rcx), %zmm24
-; AVX512BW-NEXT:    vmovdqa64 384(%rcx), %zmm22
-; AVX512BW-NEXT:    vmovdqa64 448(%rcx), %zmm21
-; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [4,12,4,12]
-; AVX512BW-NEXT:    # ymm3 = mem[0,1,0,1]
-; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm4
-; AVX512BW-NEXT:    vpermt2q %zmm21, %zmm3, %zmm4
-; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm4
-; AVX512BW-NEXT:    vpermt2q %zmm22, %zmm3, %zmm4
-; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm4
-; AVX512BW-NEXT:    vpermt2q %zmm24, %zmm3, %zmm4
-; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm4
-; AVX512BW-NEXT:    vpermt2q %zmm27, %zmm3, %zmm4
-; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm4
-; AVX512BW-NEXT:    vpermt2q %zmm30, %zmm3, %zmm4
-; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm4
-; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm3, %zmm4
-; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm12, %zmm4
-; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm3, %zmm4
-; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermi2q %zmm0, %zmm13, %zmm3
+; AVX512BW-NEXT:    subq $3720, %rsp # imm = 0xE88
+; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm11
+; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %zmm10
+; AVX512BW-NEXT:    vmovdqa64 128(%rdi), %zmm9
+; AVX512BW-NEXT:    vmovdqa64 192(%rdi), %zmm8
+; AVX512BW-NEXT:    vmovdqa64 256(%rdi), %zmm7
+; AVX512BW-NEXT:    vmovdqa64 320(%rdi), %zmm6
+; AVX512BW-NEXT:    vmovdqa64 384(%rdi), %zmm5
+; AVX512BW-NEXT:    vmovdqa64 448(%rdi), %zmm4
+; AVX512BW-NEXT:    vmovdqa64 (%rsi), %zmm30
+; AVX512BW-NEXT:    vmovdqa64 64(%rsi), %zmm29
+; AVX512BW-NEXT:    vmovdqa64 128(%rsi), %zmm28
+; AVX512BW-NEXT:    vmovdqa64 192(%rsi), %zmm27
+; AVX512BW-NEXT:    vmovdqa64 256(%rsi), %zmm1
+; AVX512BW-NEXT:    vmovdqa64 320(%rsi), %zmm13
+; AVX512BW-NEXT:    vmovdqa64 384(%rsi), %zmm24
+; AVX512BW-NEXT:    vmovdqa64 448(%rsi), %zmm0
+; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [4,12,5,13,4,12,5,13]
+; AVX512BW-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm22
+; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm2, %zmm22
+; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm3
+; AVX512BW-NEXT:    vpermt2q %zmm24, %zmm2, %zmm3
 ; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm5 = [1,9,2,10,1,9,2,10]
-; AVX512BW-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm3
-; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm5, %zmm3
+; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm3
+; AVX512BW-NEXT:    vpermt2q %zmm13, %zmm2, %zmm3
 ; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm4
-; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm5 = [5,13,6,14,5,13,6,14]
-; AVX512BW-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm3
-; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm5, %zmm3
+; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm3
+; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm2, %zmm3
 ; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm12, %zmm3
-; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm4, %zmm3
+; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm3
+; AVX512BW-NEXT:    vpermt2q %zmm27, %zmm2, %zmm3
 ; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm14 = [3,11,3,11,3,11,3,11]
-; AVX512BW-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm3
-; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm14, %zmm3
+; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm3
+; AVX512BW-NEXT:    vpermt2q %zmm28, %zmm2, %zmm3
 ; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [7,15,7,15,7,15,7,15]
-; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm3, %zmm13
-; AVX512BW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm12, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm5, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm12, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm14, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm3, %zmm12
-; AVX512BW-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm4, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm5, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm14, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm3, %zmm11
+; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm3
+; AVX512BW-NEXT:    vpermt2q %zmm29, %zmm2, %zmm3
+; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermi2q %zmm30, %zmm11, %zmm2
+; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10]
+; AVX512BW-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm3
+; AVX512BW-NEXT:    vpermt2q %zmm30, %zmm2, %zmm3
+; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm12
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [6,14,6,14,6,14,6,14]
+; AVX512BW-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm3
+; AVX512BW-NEXT:    vpermt2q %zmm30, %zmm2, %zmm3
+; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm15 = [7,15,7,15,7,15,7,15]
+; AVX512BW-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm3
+; AVX512BW-NEXT:    vpermt2q %zmm30, %zmm15, %zmm3
+; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm3
+; AVX512BW-NEXT:    vpermt2q %zmm29, %zmm12, %zmm3
+; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [0,8,1,9,0,8,1,9]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vpermt2q %zmm30, %zmm3, %zmm11
 ; AVX512BW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm30, %zmm4, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm30, %zmm5, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm30, %zmm14, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm30, %zmm3, %zmm10
+; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm11
+; AVX512BW-NEXT:    vpermt2q %zmm29, %zmm2, %zmm11
+; AVX512BW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm11
+; AVX512BW-NEXT:    vpermt2q %zmm29, %zmm15, %zmm11
+; AVX512BW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm29, %zmm3, %zmm10
 ; AVX512BW-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm27, %zmm4, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm27, %zmm5, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm27, %zmm14, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm27, %zmm3, %zmm9
+; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm10
+; AVX512BW-NEXT:    vpermt2q %zmm28, %zmm12, %zmm10
+; AVX512BW-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm10
+; AVX512BW-NEXT:    vpermt2q %zmm28, %zmm2, %zmm10
+; AVX512BW-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm10
+; AVX512BW-NEXT:    vpermt2q %zmm28, %zmm15, %zmm10
+; AVX512BW-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm28, %zmm3, %zmm9
 ; AVX512BW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm24, %zmm4, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm24, %zmm5, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm24, %zmm14, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm24, %zmm3, %zmm8
+; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm9
+; AVX512BW-NEXT:    vpermt2q %zmm27, %zmm12, %zmm9
+; AVX512BW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm9
+; AVX512BW-NEXT:    vpermt2q %zmm27, %zmm2, %zmm9
+; AVX512BW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm9
+; AVX512BW-NEXT:    vpermt2q %zmm27, %zmm15, %zmm9
+; AVX512BW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm27, %zmm3, %zmm8
+; AVX512BW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm8
+; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm12, %zmm8
+; AVX512BW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm8
+; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm2, %zmm8
 ; AVX512BW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm22, %zmm4, %zmm0
+; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm8
+; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm15, %zmm8
+; AVX512BW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm3, %zmm7
+; AVX512BW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm1
+; AVX512BW-NEXT:    vpermt2q %zmm13, %zmm12, %zmm1
+; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm1
+; AVX512BW-NEXT:    vpermt2q %zmm13, %zmm2, %zmm1
+; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm1
+; AVX512BW-NEXT:    vpermt2q %zmm13, %zmm15, %zmm1
+; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm13, %zmm3, %zmm6
+; AVX512BW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm1
+; AVX512BW-NEXT:    vpermt2q %zmm24, %zmm12, %zmm1
+; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm1
+; AVX512BW-NEXT:    vpermt2q %zmm24, %zmm2, %zmm1
+; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm8
+; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm1
+; AVX512BW-NEXT:    vpermt2q %zmm24, %zmm15, %zmm1
+; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm24, %zmm3, %zmm5
+; AVX512BW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 (%rdx), %zmm5
+; AVX512BW-NEXT:    vmovdqa64 (%rcx), %zmm14
+; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm29
+; AVX512BW-NEXT:    vmovdqa64 %zmm15, %zmm21
+; AVX512BW-NEXT:    vpermt2q %zmm14, %zmm15, %zmm29
+; AVX512BW-NEXT:    vmovdqa64 64(%rdx), %zmm1
+; AVX512BW-NEXT:    vmovdqa64 64(%rcx), %zmm11
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm2
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm20
+; AVX512BW-NEXT:    vpermt2q %zmm11, %zmm15, %zmm2
+; AVX512BW-NEXT:    vmovdqu64 %zmm2, (%rsp) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 128(%rdx), %zmm23
+; AVX512BW-NEXT:    vmovdqa64 128(%rcx), %zmm9
+; AVX512BW-NEXT:    vmovdqa64 %zmm23, %zmm1
+; AVX512BW-NEXT:    vpermt2q %zmm9, %zmm15, %zmm1
+; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 192(%rdx), %zmm18
+; AVX512BW-NEXT:    vmovdqa64 192(%rcx), %zmm6
+; AVX512BW-NEXT:    vmovdqa64 %zmm18, %zmm1
+; AVX512BW-NEXT:    vpermt2q %zmm6, %zmm15, %zmm1
+; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 256(%rdx), %zmm17
+; AVX512BW-NEXT:    vmovdqa64 256(%rcx), %zmm7
+; AVX512BW-NEXT:    vmovdqa64 %zmm17, %zmm1
+; AVX512BW-NEXT:    vpermt2q %zmm7, %zmm15, %zmm1
+; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 320(%rdx), %zmm15
+; AVX512BW-NEXT:    vmovdqa64 320(%rcx), %zmm10
+; AVX512BW-NEXT:    vmovdqa64 %zmm15, %zmm1
+; AVX512BW-NEXT:    vpermt2q %zmm10, %zmm21, %zmm1
+; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 384(%rdx), %zmm13
+; AVX512BW-NEXT:    vmovdqa64 384(%rcx), %zmm2
+; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm1
+; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm21, %zmm1
+; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm1
+; AVX512BW-NEXT:    vpermi2q %zmm0, %zmm4, %zmm12
+; AVX512BW-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermi2q %zmm0, %zmm4, %zmm8
+; AVX512BW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 448(%rdx), %zmm16
+; AVX512BW-NEXT:    vmovdqa64 448(%rcx), %zmm4
+; AVX512BW-NEXT:    vmovdqa64 %zmm16, %zmm8
+; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm21, %zmm8
+; AVX512BW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermi2q %zmm0, %zmm1, %zmm21
+; AVX512BW-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm3, %zmm1
+; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [4,12,4,12]
+; AVX512BW-NEXT:    # ymm1 = mem[0,1,0,1]
+; AVX512BW-NEXT:    vmovdqa64 %zmm16, %zmm24
+; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm1, %zmm24
+; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm19
+; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm1, %zmm19
+; AVX512BW-NEXT:    vmovdqa64 %zmm15, %zmm0
+; AVX512BW-NEXT:    vpermt2q %zmm10, %zmm1, %zmm0
 ; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm22, %zmm5, %zmm0
+; AVX512BW-NEXT:    vmovdqa64 %zmm17, %zmm31
+; AVX512BW-NEXT:    vpermt2q %zmm7, %zmm1, %zmm31
+; AVX512BW-NEXT:    vmovdqa64 %zmm18, %zmm21
+; AVX512BW-NEXT:    vpermt2q %zmm6, %zmm1, %zmm21
+; AVX512BW-NEXT:    vmovdqa64 %zmm23, %zmm12
+; AVX512BW-NEXT:    vpermt2q %zmm9, %zmm1, %zmm12
+; AVX512BW-NEXT:    vmovdqa64 %zmm20, %zmm3
+; AVX512BW-NEXT:    vpermt2q %zmm11, %zmm1, %zmm3
+; AVX512BW-NEXT:    vpermi2q %zmm14, %zmm5, %zmm1
+; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm27 = [1,9,2,10,1,9,2,10]
+; AVX512BW-NEXT:    # zmm27 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm0
+; AVX512BW-NEXT:    vpermt2q %zmm14, %zmm27, %zmm0
 ; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm22, %zmm14, %zmm0
+; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm28 = [5,13,6,14,5,13,6,14]
+; AVX512BW-NEXT:    # zmm28 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm0
+; AVX512BW-NEXT:    vpermt2q %zmm14, %zmm28, %zmm0
 ; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm22, %zmm3, %zmm7
-; AVX512BW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermi2q %zmm21, %zmm6, %zmm4
-; AVX512BW-NEXT:    vmovdqu64 %zmm4, (%rsp) # 64-byte Spill
-; AVX512BW-NEXT:    vpermi2q %zmm21, %zmm6, %zmm5
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11]
+; AVX512BW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vpermt2q %zmm14, %zmm0, %zmm5
+; AVX512BW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm20, %zmm5
+; AVX512BW-NEXT:    vpermt2q %zmm11, %zmm27, %zmm5
 ; AVX512BW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermi2q %zmm21, %zmm6, %zmm14
+; AVX512BW-NEXT:    vmovdqa64 %zmm20, %zmm5
+; AVX512BW-NEXT:    vpermt2q %zmm11, %zmm28, %zmm5
+; AVX512BW-NEXT:    vpermt2q %zmm11, %zmm0, %zmm20
+; AVX512BW-NEXT:    vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm23, %zmm8
+; AVX512BW-NEXT:    vpermt2q %zmm9, %zmm27, %zmm8
+; AVX512BW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm23, %zmm11
+; AVX512BW-NEXT:    vpermt2q %zmm9, %zmm28, %zmm11
+; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm30
+; AVX512BW-NEXT:    vpermt2q %zmm9, %zmm0, %zmm23
+; AVX512BW-NEXT:    vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm18, %zmm9
+; AVX512BW-NEXT:    vpermt2q %zmm6, %zmm27, %zmm9
+; AVX512BW-NEXT:    vmovdqa64 %zmm18, %zmm8
+; AVX512BW-NEXT:    vpermt2q %zmm6, %zmm28, %zmm8
+; AVX512BW-NEXT:    vpermt2q %zmm6, %zmm0, %zmm18
+; AVX512BW-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm17, %zmm6
+; AVX512BW-NEXT:    vpermt2q %zmm7, %zmm27, %zmm6
+; AVX512BW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm17, %zmm6
+; AVX512BW-NEXT:    vmovdqa64 %zmm17, %zmm14
+; AVX512BW-NEXT:    vpermt2q %zmm7, %zmm28, %zmm6
+; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm11
+; AVX512BW-NEXT:    vpermt2q %zmm7, %zmm0, %zmm14
 ; AVX512BW-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm21, %zmm3, %zmm6
+; AVX512BW-NEXT:    vmovdqa64 %zmm15, %zmm6
+; AVX512BW-NEXT:    vmovdqa64 %zmm15, %zmm7
+; AVX512BW-NEXT:    vpermt2q %zmm10, %zmm27, %zmm7
+; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm26
+; AVX512BW-NEXT:    vmovdqa64 %zmm15, %zmm7
+; AVX512BW-NEXT:    vpermt2q %zmm10, %zmm28, %zmm7
+; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm25
+; AVX512BW-NEXT:    vpermt2q %zmm10, %zmm0, %zmm6
 ; AVX512BW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 448(%rdi), %zmm4
-; AVX512BW-NEXT:    vmovdqa64 448(%rsi), %zmm11
-; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm28 = [4,12,5,13,4,12,5,13]
-; AVX512BW-NEXT:    # zmm28 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm11, %zmm28, %zmm0
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm8
-; AVX512BW-NEXT:    vmovdqa64 384(%rdi), %zmm2
-; AVX512BW-NEXT:    vmovdqa64 384(%rsi), %zmm13
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm1
-; AVX512BW-NEXT:    vpermt2q %zmm13, %zmm28, %zmm1
-; AVX512BW-NEXT:    vmovdqa64 320(%rdi), %zmm3
-; AVX512BW-NEXT:    vmovdqa64 320(%rsi), %zmm15
-; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm15, %zmm28, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 256(%rdi), %zmm31
-; AVX512BW-NEXT:    vmovdqa64 256(%rsi), %zmm17
-; AVX512BW-NEXT:    vmovdqa64 %zmm31, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm17, %zmm28, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 192(%rdi), %zmm29
-; AVX512BW-NEXT:    vmovdqa64 192(%rsi), %zmm18
-; AVX512BW-NEXT:    vmovdqa64 %zmm29, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm18, %zmm28, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 128(%rdi), %zmm25
-; AVX512BW-NEXT:    vmovdqa64 128(%rsi), %zmm19
-; AVX512BW-NEXT:    vmovdqa64 %zmm25, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm19, %zmm28, %zmm0
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm5
-; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %zmm6
-; AVX512BW-NEXT:    vmovdqa64 64(%rsi), %zmm20
-; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm20, %zmm28, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm7
-; AVX512BW-NEXT:    vmovdqa64 (%rsi), %zmm22
-; AVX512BW-NEXT:    vpermi2q %zmm22, %zmm7, %zmm28
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm12 = [2,10,2,10,2,10,2,10]
-; AVX512BW-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm14
-; AVX512BW-NEXT:    vpermt2q %zmm22, %zmm12, %zmm14
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm10 = [6,14,6,14,6,14,6,14]
-; AVX512BW-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm16
-; AVX512BW-NEXT:    vpermt2q %zmm22, %zmm10, %zmm16
-; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm21 = [0,8,1,9,0,8,1,9]
-; AVX512BW-NEXT:    # zmm21 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm22, %zmm21, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} ymm23 = [7,15,7,15]
-; AVX512BW-NEXT:    # ymm23 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vpermt2q %zmm22, %zmm23, %zmm7
-; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm22
-; AVX512BW-NEXT:    vpermt2q %zmm20, %zmm12, %zmm22
-; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm24
-; AVX512BW-NEXT:    vpermt2q %zmm20, %zmm10, %zmm24
-; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm20, %zmm21, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm20, %zmm23, %zmm6
-; AVX512BW-NEXT:    vmovdqa64 %zmm25, %zmm20
-; AVX512BW-NEXT:    vpermt2q %zmm19, %zmm12, %zmm20
-; AVX512BW-NEXT:    vmovdqa64 %zmm25, %zmm26
-; AVX512BW-NEXT:    vpermt2q %zmm19, %zmm10, %zmm26
-; AVX512BW-NEXT:    vmovdqa64 %zmm25, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm19, %zmm21, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm19, %zmm23, %zmm25
-; AVX512BW-NEXT:    vmovdqa64 %zmm29, %zmm19
-; AVX512BW-NEXT:    vpermt2q %zmm18, %zmm12, %zmm19
-; AVX512BW-NEXT:    vmovdqa64 %zmm29, %zmm27
-; AVX512BW-NEXT:    vpermt2q %zmm18, %zmm10, %zmm27
-; AVX512BW-NEXT:    vmovdqa64 %zmm29, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm18, %zmm21, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm18, %zmm23, %zmm29
-; AVX512BW-NEXT:    vmovdqa64 %zmm31, %zmm18
-; AVX512BW-NEXT:    vpermt2q %zmm17, %zmm12, %zmm18
-; AVX512BW-NEXT:    vmovdqa64 %zmm31, %zmm9
-; AVX512BW-NEXT:    vpermt2q %zmm17, %zmm10, %zmm9
-; AVX512BW-NEXT:    vmovdqa64 %zmm31, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm17, %zmm21, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm17, %zmm23, %zmm31
-; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm17
-; AVX512BW-NEXT:    vpermt2q %zmm15, %zmm12, %zmm17
-; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm30
-; AVX512BW-NEXT:    vpermt2q %zmm15, %zmm10, %zmm30
-; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm15, %zmm21, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm15, %zmm23, %zmm3
-; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm15
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm3
-; AVX512BW-NEXT:    vpermt2q %zmm13, %zmm12, %zmm3
-; AVX512BW-NEXT:    vpermt2q %zmm13, %zmm10, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 %zmm15, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm13, %zmm21, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm13, %zmm23, %zmm15
-; AVX512BW-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermi2q %zmm11, %zmm4, %zmm12
-; AVX512BW-NEXT:    vpermi2q %zmm11, %zmm4, %zmm10
-; AVX512BW-NEXT:    vpermi2q %zmm11, %zmm4, %zmm21
-; AVX512BW-NEXT:    vpermt2q %zmm11, %zmm23, %zmm4
-; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm6
+; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm7
+; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm27, %zmm7
+; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm10
+; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm7
+; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm28, %zmm7
+; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm23
+; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm0, %zmm6
+; AVX512BW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermi2q %zmm4, %zmm16, %zmm27
+; AVX512BW-NEXT:    vpermi2q %zmm4, %zmm16, %zmm28
+; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm0, %zmm16
+; AVX512BW-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    movb $12, %al
 ; AVX512BW-NEXT:    kmovd %eax, %k1
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm8 {%k1}
-; AVX512BW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm1 {%k1}
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm8 {%k1}
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm24, %zmm22 {%k1}
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm19, %zmm18 {%k1}
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm0 {%k1}
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm4 {%k1}
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm5 {%k1}
-; AVX512BW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm1 {%k1}
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm28 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm19 {%k1}
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm31, %zmm17 {%k1}
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm21, %zmm16 {%k1}
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm12, %zmm21 {%k1}
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm31 {%k1}
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm20 {%k1}
 ; AVX512BW-NEXT:    movb $48, %al
 ; AVX512BW-NEXT:    kmovd %eax, %k2
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm14, %zmm11 {%k2}
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm16, %zmm23 {%k2}
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm22, %zmm13 {%k2}
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm24, %zmm22 {%k2}
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm20, %zmm14 {%k2}
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm26, %zmm20 {%k2}
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm19, %zmm16 {%k2}
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm27, %zmm19 {%k2}
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm18, %zmm24 {%k2}
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm18 {%k2}
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm17, %zmm26 {%k2}
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm30, %zmm17 {%k2}
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm27 {%k2}
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm0 {%k2}
-; AVX512BW-NEXT:    vmovdqu64 (%rsp), %zmm15 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm12, %zmm15 {%k2}
-; AVX512BW-NEXT:    vmovdqa64 (%r8), %zmm5
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm2 = <0,1,9,u,4,5,6,7>
-; AVX512BW-NEXT:    vpermt2q %zmm5, %zmm2, %zmm11
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k2}
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm3 {%k2}
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm14 = zmm0[0,1,2,3],zmm29[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm4 {%k2}
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm5 {%k2}
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vshufi64x2 $228, (%rsp), %zmm0, %zmm0 # 64-byte Folded Reload
+; AVX512BW-NEXT:    # zmm0 = zmm0[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm6 {%k2}
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm30 {%k2}
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm15 # 64-byte Folded Reload
+; AVX512BW-NEXT:    # zmm15 = zmm0[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm9 {%k2}
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm8 {%k2}
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm13 # 64-byte Folded Reload
+; AVX512BW-NEXT:    # zmm13 = zmm0[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm7 {%k2}
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm11 {%k2}
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm12 # 64-byte Folded Reload
+; AVX512BW-NEXT:    # zmm12 = zmm0[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm26 {%k2}
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm25 {%k2}
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
+; AVX512BW-NEXT:    # zmm0 = zmm0[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm10 {%k2}
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm23 {%k2}
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
+; AVX512BW-NEXT:    # zmm0 = zmm0[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm27 {%k2}
+; AVX512BW-NEXT:    vmovdqa64 (%r8), %zmm29
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = <0,1,9,u,4,5,6,7>
+; AVX512BW-NEXT:    vpermt2q %zmm29, %zmm1, %zmm2
+; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 64(%r8), %zmm2
+; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm1, %zmm4
+; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 128(%r8), %zmm0
+; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm1, %zmm6
+; AVX512BW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 192(%r8), %zmm4
+; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm1, %zmm9
+; AVX512BW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 256(%r8), %zmm6
+; AVX512BW-NEXT:    vpermt2q %zmm6, %zmm1, %zmm7
+; AVX512BW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 320(%r8), %zmm7
+; AVX512BW-NEXT:    vpermt2q %zmm7, %zmm1, %zmm26
+; AVX512BW-NEXT:    vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 384(%r8), %zmm9
+; AVX512BW-NEXT:    vpermt2q %zmm9, %zmm1, %zmm10
+; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm24
+; AVX512BW-NEXT:    vmovdqa64 448(%r8), %zmm10
+; AVX512BW-NEXT:    vpermt2q %zmm10, %zmm1, %zmm27
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm28 {%k2}
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = <0,1,13,u,4,5,6,7>
+; AVX512BW-NEXT:    vpermt2q %zmm29, %zmm1, %zmm3
+; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm1, %zmm5
+; AVX512BW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm1, %zmm30
+; AVX512BW-NEXT:    vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm1, %zmm8
+; AVX512BW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm6, %zmm1, %zmm11
 ; AVX512BW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 64(%r8), %zmm3
-; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm2, %zmm13
-; AVX512BW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 128(%r8), %zmm9
-; AVX512BW-NEXT:    vpermt2q %zmm9, %zmm2, %zmm14
+; AVX512BW-NEXT:    vpermt2q %zmm7, %zmm1, %zmm25
+; AVX512BW-NEXT:    vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm9, %zmm1, %zmm23
+; AVX512BW-NEXT:    vpermt2q %zmm10, %zmm1, %zmm28
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm3 # 64-byte Folded Reload
+; AVX512BW-NEXT:    # zmm3 = zmm1[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = <14,u,2,3,4,5,15,u>
+; AVX512BW-NEXT:    vpermt2q %zmm29, %zmm1, %zmm14
 ; AVX512BW-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 192(%r8), %zmm11
-; AVX512BW-NEXT:    vpermt2q %zmm11, %zmm2, %zmm16
-; AVX512BW-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 256(%r8), %zmm13
-; AVX512BW-NEXT:    vpermt2q %zmm13, %zmm2, %zmm24
-; AVX512BW-NEXT:    vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 320(%r8), %zmm14
-; AVX512BW-NEXT:    vpermt2q %zmm14, %zmm2, %zmm26
-; AVX512BW-NEXT:    vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 384(%r8), %zmm16
-; AVX512BW-NEXT:    vpermt2q %zmm16, %zmm2, %zmm27
-; AVX512BW-NEXT:    vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 448(%r8), %zmm12
-; AVX512BW-NEXT:    vpermt2q %zmm12, %zmm2, %zmm15
-; AVX512BW-NEXT:    vmovdqu64 %zmm15, (%rsp) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqu64 (%rsp), %zmm14 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm1, %zmm14
+; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm1, %zmm15
+; AVX512BW-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm1, %zmm13
+; AVX512BW-NEXT:    vpermt2q %zmm6, %zmm1, %zmm12
+; AVX512BW-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm7, %zmm1, %zmm12
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm15 {%k2}
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm2 = <0,1,13,u,4,5,6,7>
-; AVX512BW-NEXT:    vpermt2q %zmm5, %zmm2, %zmm23
-; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm2, %zmm22
-; AVX512BW-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm9, %zmm2, %zmm20
-; AVX512BW-NEXT:    vmovdqa64 %zmm20, %zmm24
-; AVX512BW-NEXT:    vpermt2q %zmm11, %zmm2, %zmm19
-; AVX512BW-NEXT:    vpermt2q %zmm13, %zmm2, %zmm18
-; AVX512BW-NEXT:    vmovdqa64 %zmm18, %zmm26
-; AVX512BW-NEXT:    vpermt2q %zmm14, %zmm2, %zmm17
+; AVX512BW-NEXT:    vpermt2q %zmm9, %zmm1, %zmm15
+; AVX512BW-NEXT:    vpermt2q %zmm10, %zmm1, %zmm3
+; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm1
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3]
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm3, %zmm3
+; AVX512BW-NEXT:    vmovdqa 64(%rdi), %ymm1
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3]
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm5, %zmm5
+; AVX512BW-NEXT:    movb $16, %al
+; AVX512BW-NEXT:    kmovd %eax, %k2
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm31 {%k2}
+; AVX512BW-NEXT:    vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = <10,u,2,3,4,5,11,u>
+; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm1, %zmm5
+; AVX512BW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa 128(%rdi), %ymm2
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3]
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm5, %zmm31
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm21 {%k2}
+; AVX512BW-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm1, %zmm31
+; AVX512BW-NEXT:    vmovdqa 192(%rdi), %ymm2
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3]
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm0, %zmm30
+; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm16 {%k2}
+; AVX512BW-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm1, %zmm30
+; AVX512BW-NEXT:    vmovdqa 256(%rdi), %ymm2
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3]
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm0, %zmm26
+; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm17 {%k2}
 ; AVX512BW-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm16, %zmm2, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm12, %zmm2, %zmm15
-; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm2
+; AVX512BW-NEXT:    vpermt2q %zmm6, %zmm1, %zmm26
+; AVX512BW-NEXT:    vmovdqa 320(%rdi), %ymm2
 ; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3]
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm0, %zmm10
-; AVX512BW-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm7 # 64-byte Folded Reload
-; AVX512BW-NEXT:    # zmm7 = zmm7[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqa 64(%rdi), %ymm2
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm0, %zmm25
+; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm19 {%k2}
+; AVX512BW-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm7, %zmm1, %zmm25
+; AVX512BW-NEXT:    vmovdqa 384(%rdi), %ymm2
 ; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3]
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm0, %zmm0
-; AVX512BW-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm6 # 64-byte Folded Reload
-; AVX512BW-NEXT:    # zmm6 = zmm6[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-NEXT:    movb $16, %al
-; AVX512BW-NEXT:    kmovd %eax, %k2
-; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm1 {%k2}
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = <10,u,2,3,4,5,11,u>
-; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm1, %zmm0
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm0, %zmm19
+; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm18 {%k2}
+; AVX512BW-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm9, %zmm1, %zmm19
+; AVX512BW-NEXT:    vmovdqa 448(%rdi), %ymm2
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3]
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm0, %zmm16
+; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm22 {%k2}
+; AVX512BW-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm10, %zmm1, %zmm16
+; AVX512BW-NEXT:    vpermt2q %zmm29, %zmm1, %zmm3
+; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm29, %zmm20 {%k2}
+; AVX512BW-NEXT:    vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 (%r9), %zmm29
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm6 = [0,1,2,9,4,5,6,7]
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm29, %zmm6, %zmm0
 ; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm2 = <14,u,2,3,4,5,15,u>
-; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm2, %zmm6
-; AVX512BW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa 128(%rdi), %ymm3
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3]
+; AVX512BW-NEXT:    vmovdqa64 64(%r9), %zmm3
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm3, %zmm0, %zmm3
-; AVX512BW-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm6 # 64-byte Folded Reload
-; AVX512BW-NEXT:    # zmm6 = zmm25[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm6, %zmm0
+; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 128(%r9), %zmm4
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm0 {%k2}
+; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm6, %zmm0
 ; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm9, %zmm1, %zmm3
-; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm9, %zmm2, %zmm6
-; AVX512BW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa 192(%rdi), %ymm3
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3]
+; AVX512BW-NEXT:    vmovdqa64 192(%r9), %zmm5
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm3, %zmm0, %zmm0
-; AVX512BW-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm3 # 64-byte Folded Reload
-; AVX512BW-NEXT:    # zmm3 = zmm29[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm4 {%k2}
-; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm11, %zmm1, %zmm0
+; AVX512BW-NEXT:    vpermt2q %zmm5, %zmm6, %zmm0
 ; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm11, %zmm2, %zmm3
-; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa 256(%rdi), %ymm0
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm3, %zmm3
-; AVX512BW-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm31, %zmm4 # 64-byte Folded Reload
-; AVX512BW-NEXT:    # zmm4 = zmm31[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqa64 256(%r9), %zmm11
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm0 {%k2}
+; AVX512BW-NEXT:    vpermt2q %zmm11, %zmm6, %zmm0
 ; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm13, %zmm1, %zmm3
-; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm13, %zmm2, %zmm4
-; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa 320(%rdi), %ymm0
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm3, %zmm27
+; AVX512BW-NEXT:    vmovdqa64 320(%r9), %zmm8
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm25 # 64-byte Folded Reload
-; AVX512BW-NEXT:    # zmm25 = zmm0[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqa64 %zmm14, %zmm8 {%k2}
-; AVX512BW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm14, %zmm1, %zmm27
-; AVX512BW-NEXT:    vpermt2q %zmm14, %zmm2, %zmm25
-; AVX512BW-NEXT:    vmovdqa 384(%rdi), %ymm0
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm3, %zmm22
+; AVX512BW-NEXT:    vpermt2q %zmm8, %zmm6, %zmm0
+; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 384(%r9), %zmm7
+; AVX512BW-NEXT:    vpermt2q %zmm7, %zmm6, %zmm24
+; AVX512BW-NEXT:    vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 448(%r9), %zmm22
+; AVX512BW-NEXT:    vpermt2q %zmm22, %zmm6, %zmm27
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm6 = [0,1,2,13,4,5,6,7]
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm20 # 64-byte Folded Reload
-; AVX512BW-NEXT:    # zmm20 = zmm0[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-NEXT:    vpermt2q %zmm29, %zmm6, %zmm0
+; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm16, %zmm0 {%k2}
+; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm6, %zmm0
 ; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm16, %zmm1, %zmm22
-; AVX512BW-NEXT:    vpermt2q %zmm16, %zmm2, %zmm20
-; AVX512BW-NEXT:    vmovdqa 448(%rdi), %ymm0
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm3, %zmm18
-; AVX512BW-NEXT:    vpermt2q %zmm5, %zmm1, %zmm10
-; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm11
-; AVX512BW-NEXT:    vpermt2q %zmm12, %zmm1, %zmm18
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm17 # 64-byte Folded Reload
-; AVX512BW-NEXT:    # zmm17 = zmm0[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm6, %zmm0
+; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm12, %zmm0 {%k2}
+; AVX512BW-NEXT:    vpermt2q %zmm5, %zmm6, %zmm0
 ; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm12, %zmm2, %zmm17
-; AVX512BW-NEXT:    vpermt2q %zmm5, %zmm2, %zmm7
-; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm13
-; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm28 {%k2}
-; AVX512BW-NEXT:    vmovdqa64 (%r9), %zmm6
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm10 = [0,1,2,9,4,5,6,7]
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm6, %zmm10, %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 64(%r9), %zmm7
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm7, %zmm10, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 128(%r9), %zmm2
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm10, %zmm3
-; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 192(%r9), %zmm3
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm10, %zmm4
-; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 256(%r9), %zmm4
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm10, %zmm5
-; AVX512BW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 320(%r9), %zmm5
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm5, %zmm10, %zmm0
+; AVX512BW-NEXT:    vpermt2q %zmm11, %zmm6, %zmm0
 ; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 384(%r9), %zmm8
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm8, %zmm10, %zmm0
+; AVX512BW-NEXT:    vpermt2q %zmm8, %zmm6, %zmm0
 ; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 448(%r9), %zmm9
-; AVX512BW-NEXT:    vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm9, %zmm10, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm10 = [0,1,2,13,4,5,6,7]
-; AVX512BW-NEXT:    vpermt2q %zmm6, %zmm10, %zmm23
+; AVX512BW-NEXT:    vpermt2q %zmm7, %zmm6, %zmm23
 ; AVX512BW-NEXT:    vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm7, %zmm10, %zmm12
-; AVX512BW-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm10, %zmm24
-; AVX512BW-NEXT:    vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm10, %zmm19
-; AVX512BW-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm10, %zmm26
-; AVX512BW-NEXT:    vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm22, %zmm6, %zmm28
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm6 = [0,14,2,3,4,5,6,15]
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm5, %zmm10, %zmm0
+; AVX512BW-NEXT:    vpermt2q %zmm29, %zmm6, %zmm0
 ; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm6, %zmm14
+; AVX512BW-NEXT:    vmovdqu64 %zmm14, (%rsp) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm8, %zmm10, %zmm0
+; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm6, %zmm0
 ; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm9, %zmm10, %zmm15
+; AVX512BW-NEXT:    vpermt2q %zmm5, %zmm6, %zmm13
+; AVX512BW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm11, %zmm6, %zmm0
+; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm8, %zmm6, %zmm12
+; AVX512BW-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm7, %zmm6, %zmm15
 ; AVX512BW-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa (%rdx), %xmm10
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0]
-; AVX512BW-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm10, %zmm0, %zmm12 {%k1}
-; AVX512BW-NEXT:    vmovdqa 64(%rdx), %xmm10
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0]
-; AVX512BW-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm10, %zmm0, %zmm14 {%k1}
-; AVX512BW-NEXT:    vmovdqa 128(%rdx), %xmm10
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0]
-; AVX512BW-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm10, %zmm0, %zmm16 {%k1}
-; AVX512BW-NEXT:    vmovdqa 192(%rdx), %xmm10
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0]
-; AVX512BW-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm10, %zmm0, %zmm30 {%k1}
-; AVX512BW-NEXT:    vmovdqa 256(%rdx), %xmm10
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0]
-; AVX512BW-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm10, %zmm0, %zmm31 {%k1}
-; AVX512BW-NEXT:    vmovdqa 320(%rdx), %xmm10
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0]
-; AVX512BW-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm10, %zmm0, %zmm1 {%k1}
-; AVX512BW-NEXT:    vmovdqa 384(%rdx), %xmm10
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0]
-; AVX512BW-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm10, %zmm0, %zmm0 {%k1}
-; AVX512BW-NEXT:    vmovdqa 448(%rdx), %xmm10
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0]
-; AVX512BW-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm10, %zmm0, %zmm21 {%k1}
-; AVX512BW-NEXT:    vinserti32x4 $2, (%r8), %zmm12, %zmm10
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm29 = [0,1,2,3,4,8,6,7]
-; AVX512BW-NEXT:    vpermt2q %zmm6, %zmm29, %zmm10
-; AVX512BW-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vinserti32x4 $2, 64(%r8), %zmm14, %zmm10
-; AVX512BW-NEXT:    vpermt2q %zmm7, %zmm29, %zmm10
-; AVX512BW-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vinserti32x4 $2, 128(%r8), %zmm16, %zmm10
-; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm29, %zmm10
-; AVX512BW-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vinserti32x4 $2, 192(%r8), %zmm30, %zmm26
-; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm29, %zmm26
-; AVX512BW-NEXT:    vinserti32x4 $2, 256(%r8), %zmm31, %zmm24
-; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm29, %zmm24
-; AVX512BW-NEXT:    vinserti32x4 $2, 320(%r8), %zmm1, %zmm23
-; AVX512BW-NEXT:    vpermt2q %zmm5, %zmm29, %zmm23
-; AVX512BW-NEXT:    vinserti32x4 $2, 384(%r8), %zmm0, %zmm19
-; AVX512BW-NEXT:    vpermt2q %zmm8, %zmm29, %zmm19
-; AVX512BW-NEXT:    vinserti32x4 $2, 448(%r8), %zmm21, %zmm21
-; AVX512BW-NEXT:    vpermt2q %zmm9, %zmm29, %zmm21
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm29 = [0,10,2,3,4,5,6,11]
-; AVX512BW-NEXT:    vpermt2q %zmm6, %zmm29, %zmm11
-; AVX512BW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm30 = [0,1,2,3,4,12,6,7]
-; AVX512BW-NEXT:    vpermt2q %zmm6, %zmm30, %zmm28
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm31 = [0,14,2,3,4,5,6,15]
-; AVX512BW-NEXT:    vpermt2q %zmm6, %zmm31, %zmm13
-; AVX512BW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm7, %zmm29, %zmm16
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm7, %zmm30, %zmm15
+; AVX512BW-NEXT:    vpermt2q %zmm22, %zmm6, %zmm0
+; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa (%rdx), %xmm6
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0]
+; AVX512BW-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm6, %zmm0, %zmm0 {%k1}
+; AVX512BW-NEXT:    vmovdqa 64(%rdx), %xmm6
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0]
+; AVX512BW-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm7, %zmm31, %zmm14
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm29, %zmm13
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm30, %zmm12
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm31, %zmm11
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm6, %zmm0, %zmm14 {%k1}
+; AVX512BW-NEXT:    vmovdqa 128(%rdx), %xmm6
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0]
+; AVX512BW-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm6, %zmm0, %zmm15 {%k1}
+; AVX512BW-NEXT:    vmovdqa 192(%rdx), %xmm6
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0]
+; AVX512BW-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm29, %zmm10
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm30, %zmm7
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm6, %zmm0, %zmm10 {%k1}
+; AVX512BW-NEXT:    vmovdqa 256(%rdx), %xmm6
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0]
+; AVX512BW-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm6, %zmm0, %zmm12 {%k1}
+; AVX512BW-NEXT:    vmovdqa 320(%rdx), %xmm6
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0]
+; AVX512BW-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm31, %zmm2
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm6, %zmm0, %zmm2 {%k1}
+; AVX512BW-NEXT:    vmovdqa 384(%rdx), %xmm6
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0]
+; AVX512BW-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm6, %zmm0, %zmm9 {%k1}
+; AVX512BW-NEXT:    vmovdqa 448(%rdx), %xmm6
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0]
+; AVX512BW-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm29, %zmm1
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm6, %zmm0, %zmm1 {%k1}
+; AVX512BW-NEXT:    vinserti32x4 $2, (%r8), %zmm0, %zmm24
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,8,6,7]
+; AVX512BW-NEXT:    vpermt2q %zmm29, %zmm0, %zmm24
+; AVX512BW-NEXT:    vinserti32x4 $2, 64(%r8), %zmm14, %zmm23
+; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm0, %zmm23
+; AVX512BW-NEXT:    vinserti32x4 $2, 128(%r8), %zmm15, %zmm21
+; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm0, %zmm21
+; AVX512BW-NEXT:    vinserti32x4 $2, 192(%r8), %zmm10, %zmm20
+; AVX512BW-NEXT:    vpermt2q %zmm5, %zmm0, %zmm20
+; AVX512BW-NEXT:    vinserti32x4 $2, 256(%r8), %zmm12, %zmm18
+; AVX512BW-NEXT:    vpermt2q %zmm11, %zmm0, %zmm18
+; AVX512BW-NEXT:    vinserti32x4 $2, 320(%r8), %zmm2, %zmm17
+; AVX512BW-NEXT:    vpermt2q %zmm8, %zmm0, %zmm17
+; AVX512BW-NEXT:    vinserti32x4 $2, 384(%r8), %zmm9, %zmm15
+; AVX512BW-NEXT:    vpermt2q %zmm7, %zmm0, %zmm15
+; AVX512BW-NEXT:    vinserti32x4 $2, 448(%r8), %zmm1, %zmm14
+; AVX512BW-NEXT:    vpermt2q %zmm22, %zmm0, %zmm14
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [0,10,2,3,4,5,6,11]
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm29, %zmm0, %zmm13
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,4,12,6,7]
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm29, %zmm1, %zmm12
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm0, %zmm10
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm1, %zmm9
+; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm0, %zmm31
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm1, %zmm6
+; AVX512BW-NEXT:    vpermt2q %zmm5, %zmm0, %zmm30
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm5, %zmm1, %zmm29
+; AVX512BW-NEXT:    vpermt2q %zmm11, %zmm0, %zmm26
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm11, %zmm1, %zmm4
+; AVX512BW-NEXT:    vpermt2q %zmm8, %zmm0, %zmm25
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm30, %zmm3
+; AVX512BW-NEXT:    vpermt2q %zmm8, %zmm1, %zmm3
+; AVX512BW-NEXT:    vpermt2q %zmm7, %zmm0, %zmm19
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm7, %zmm1, %zmm2
+; AVX512BW-NEXT:    vpermt2q %zmm22, %zmm0, %zmm16
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm31, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm5, %zmm29, %zmm27
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm5, %zmm30, %zmm4
-; AVX512BW-NEXT:    vpermt2q %zmm5, %zmm31, %zmm25
-; AVX512BW-NEXT:    vpermt2q %zmm8, %zmm29, %zmm22
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm8, %zmm30, %zmm5
-; AVX512BW-NEXT:    vpermt2q %zmm8, %zmm31, %zmm20
-; AVX512BW-NEXT:    vpermt2q %zmm9, %zmm29, %zmm18
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm9, %zmm30, %zmm6
-; AVX512BW-NEXT:    vpermt2q %zmm9, %zmm31, %zmm17
+; AVX512BW-NEXT:    vpermt2q %zmm22, %zmm1, %zmm0
 ; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-NEXT:    vmovdqa64 %zmm17, 3008(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm8, 2944(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm6, 2880(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm18, 2816(%rax)
-; AVX512BW-NEXT:    vmovups (%rsp), %zmm6 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm6, 2752(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm20, 2624(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm6, 2560(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm5, 2496(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm22, 2432(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm5, 2368(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm25, 2240(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm5, 2176(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm4, 2112(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm27, 2048(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm4, 1984(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, 1856(%rax)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm1, 3008(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm28, 2944(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, 2880(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm16, 2816(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm27, 2752(%rax)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm0, 2624(%rax)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm0, 2560(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, 2496(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm19, 2432(%rax)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm0, 2368(%rax)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm0, 2240(%rax)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm0, 2176(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, 2112(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm25, 2048(%rax)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm0, 1984(%rax)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm0, 1856(%rax)
 ; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-NEXT:    vmovaps %zmm0, 1792(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm3, 1728(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, 1664(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm3, 1600(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, 1472(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm4, 1728(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm26, 1664(%rax)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm0, 1600(%rax)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm0, 1472(%rax)
 ; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-NEXT:    vmovaps %zmm0, 1408(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm7, 1344(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm10, 1280(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm2, 1216(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm11, 1088(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm29, 1344(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm30, 1280(%rax)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm0, 1216(%rax)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm0, 1088(%rax)
 ; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-NEXT:    vmovaps %zmm0, 1024(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm12, 960(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm13, 896(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm1, 832(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm14, 704(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm1, 640(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm15, 576(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm16, 512(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm6, 960(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm31, 896(%rax)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm0, 832(%rax)
+; AVX512BW-NEXT:    vmovups (%rsp), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm0, 704(%rax)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm0, 640(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm9, 576(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm10, 512(%rax)
 ; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-NEXT:    vmovaps %zmm0, 448(%rax)
 ; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-NEXT:    vmovaps %zmm0, 320(%rax)
 ; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-NEXT:    vmovaps %zmm0, 256(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm28, 192(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 128(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm12, 192(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm13, 128(%rax)
 ; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-NEXT:    vmovaps %zmm0, 64(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm21, 2688(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm19, 2304(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm23, 1920(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm24, 1536(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm26, 1152(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 768(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 384(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, (%rax)
-; AVX512BW-NEXT:    addq $3400, %rsp # imm = 0xD48
+; AVX512BW-NEXT:    vmovdqa64 %zmm14, 2688(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm15, 2304(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm17, 1920(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm18, 1536(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm20, 1152(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm21, 768(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm23, 384(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm24, (%rax)
+; AVX512BW-NEXT:    addq $3720, %rsp # imm = 0xE88
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
   %in.vec0 = load <64 x i64>, ptr %in.vecptr0, align 64

diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-7.ll
index 0de7beea9398af..6fb5e43a2df68a 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-7.ll
@@ -750,123 +750,122 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512F-ONLY-SLOW:       # %bb.0:
 ; AVX512F-ONLY-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512F-ONLY-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%rsi), %zmm1
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%rdx), %zmm3
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%rcx), %zmm4
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%r8), %zmm6
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%r9), %zmm7
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%r10), %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm5 = [15,7,15,7,15,7,15,7]
-; AVX512F-ONLY-SLOW-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm8 = [0,8,0,8,0,8,0,8]
-; AVX512F-ONLY-SLOW-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm7, %zmm6, %zmm8
-; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm9 = [5,0,14,6,5,0,14,6]
-; AVX512F-ONLY-SLOW-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm6, %zmm7, %zmm9
-; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm11 = [9,1,9,1,9,1,9,1]
-; AVX512F-ONLY-SLOW-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm7, %zmm6, %zmm11
-; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [4,12,0,5,4,12,0,5]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%rsi), %zmm6
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%rdx), %zmm8
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%rcx), %zmm7
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%r8), %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%r9), %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%r10), %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [15,7,15,7,15,7,15,7]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm9
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm4 = [2,10,0,3,2,10,0,3]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm7, %zmm8, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm11
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [3,0,12,4,3,0,12,4]
 ; AVX512F-ONLY-SLOW-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm7, %zmm6, %zmm10
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm5, %zmm6
-; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm4, %zmm3, %zmm5
-; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm7 = [15,7,15,7]
-; AVX512F-ONLY-SLOW-NEXT:    # ymm7 = mem[0,1,0,1]
-; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm1, %zmm0, %zmm7
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm8, %zmm7, %zmm10
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm13
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm3, %zmm13
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [0,8,0,1,0,8,0,1]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm6, %zmm5, %zmm8
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,6,14,6,14,6,14]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm14, %zmm9
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm6, %zmm5, %zmm14
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm15 = [11,3,11,3,11,3,11,3]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm6, %zmm5, %zmm15
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm12 = [13,5,13,5,13,5,13,5]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm12, %zmm11
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [1,0,10,2,1,0,10,2]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm5, %zmm6, %zmm7
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm6, %zmm5, %zmm12
+; AVX512F-ONLY-SLOW-NEXT:    movb $48, %sil
+; AVX512F-ONLY-SLOW-NEXT:    kmovw %esi, %k1
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm10 {%k1} = zmm5[0],zmm6[0],zmm5[2],zmm6[2],zmm5[4],zmm6[4],zmm5[6],zmm6[6]
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm6, %zmm3, %zmm5
 ; AVX512F-ONLY-SLOW-NEXT:    movb $24, %sil
 ; AVX512F-ONLY-SLOW-NEXT:    kmovw %esi, %k1
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm7 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm5 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm2, %zmm0, %zmm3
 ; AVX512F-ONLY-SLOW-NEXT:    movb $96, %sil
 ; AVX512F-ONLY-SLOW-NEXT:    kmovw %esi, %k1
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm7 {%k1}
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm5 = [14,1,2,3,4,5,6,15]
-; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm2, %zmm7, %zmm5
-; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm6 = [0,8,0,1,0,8,0,1]
-; AVX512F-ONLY-SLOW-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm1, %zmm0, %zmm6
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa (%rdx), %xmm7
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm7 = xmm7[0],mem[0]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm5 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [14,1,2,3,4,5,6,15]
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm1, %zmm5, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa (%rdx), %xmm5
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
 ; AVX512F-ONLY-SLOW-NEXT:    movb $12, %sil
 ; AVX512F-ONLY-SLOW-NEXT:    kmovw %esi, %k2
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm7, %zmm0, %zmm6 {%k2}
-; AVX512F-ONLY-SLOW-NEXT:    vinserti32x4 $3, (%r10), %zmm8, %zmm7
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm5, %zmm0, %zmm8 {%k2}
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm5 = [0,8,0,8,0,8,0,8]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm2, %zmm0, %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vinserti32x4 $3, (%r10), %zmm5, %zmm5
 ; AVX512F-ONLY-SLOW-NEXT:    movb $112, %sil
 ; AVX512F-ONLY-SLOW-NEXT:    kmovw %esi, %k2
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm6 {%k2}
-; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [0,13,6,7,0,13,6,7]
-; AVX512F-ONLY-SLOW-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm2, %zmm9, %zmm7
-; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm8 = [6,14,6,14,6,14,6,14]
-; AVX512F-ONLY-SLOW-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm4, %zmm3, %zmm8
-; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm9 = [6,14,6,14]
-; AVX512F-ONLY-SLOW-NEXT:    # ymm9 = mem[0,1,0,1]
-; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm1, %zmm0, %zmm9
-; AVX512F-ONLY-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm8 = zmm9[0,1,2,3],zmm8[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm8 {%k2}
+; AVX512F-ONLY-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm5 = zmm14[0,1,2,3],zmm9[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm6 = [5,0,14,6,5,0,14,6]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm0, %zmm2, %zmm6
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm9 = [0,13,6,7,0,13,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm1, %zmm6, %zmm9
 ; AVX512F-ONLY-SLOW-NEXT:    movb $-61, %sil
 ; AVX512F-ONLY-SLOW-NEXT:    kmovw %esi, %k2
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm8 {%k2}
-; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm7 = [11,3,11,3,11,3,11,3]
-; AVX512F-ONLY-SLOW-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm1, %zmm0, %zmm7
-; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm9 = [2,10,0,3,2,10,0,3]
-; AVX512F-ONLY-SLOW-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm4, %zmm3, %zmm9
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm9 {%k1}
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa (%r9), %ymm7
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa (%r8), %ymm12
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm7[0],ymm12[2],ymm7[2]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm5 {%k2}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm15, %zmm4 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa (%r9), %ymm6
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa (%r8), %ymm9
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm13 = ymm9[0],ymm6[0],ymm9[2],ymm6[2]
 ; AVX512F-ONLY-SLOW-NEXT:    movb $28, %sil
 ; AVX512F-ONLY-SLOW-NEXT:    kmovw %esi, %k2
-; AVX512F-ONLY-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm9 {%k2} = zmm13[2,3,2,3],zmm2[2,3,2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm13 = [1,0,10,2,1,0,10,2]
-; AVX512F-ONLY-SLOW-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm0, %zmm1, %zmm13
-; AVX512F-ONLY-SLOW-NEXT:    vpbroadcastq 8(%rcx), %ymm14
-; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm14 = mem[0,1,2,3],ymm14[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm13[2,3,2,3],zmm1[2,3,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vpbroadcastq 8(%rcx), %ymm13
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm13 = mem[0,1,2,3],ymm13[4,5,6,7]
 ; AVX512F-ONLY-SLOW-NEXT:    movb $6, %cl
 ; AVX512F-ONLY-SLOW-NEXT:    kmovw %ecx, %k2
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm14, %zmm0, %zmm13 {%k2}
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm13, %zmm0, %zmm7 {%k2}
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm13 = [9,1,9,1,9,1,9,1]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm2, %zmm0, %zmm13
 ; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm14 = [4,9,0,3,4,9,0,3]
 ; AVX512F-ONLY-SLOW-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm2, %zmm11, %zmm14
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm1, %zmm13, %zmm14
 ; AVX512F-ONLY-SLOW-NEXT:    movb $56, %cl
 ; AVX512F-ONLY-SLOW-NEXT:    kmovw %ecx, %k2
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm14, %zmm13 {%k2}
-; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm11 = [13,5,13,5,13,5,13,5]
-; AVX512F-ONLY-SLOW-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm14 = [3,0,12,4,3,0,12,4]
-; AVX512F-ONLY-SLOW-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm3, %zmm4, %zmm14
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm11, %zmm3
-; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm1, %zmm0, %zmm11
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm11 {%k1}
-; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [0,1,12,7,0,1,12,7]
-; AVX512F-ONLY-SLOW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm2, %zmm10, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm14, %zmm7 {%k2}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm12 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm11 = [4,12,0,5,4,12,0,5]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm2, %zmm0, %zmm11
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [0,1,12,7,0,1,12,7]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm1, %zmm11, %zmm0
 ; AVX512F-ONLY-SLOW-NEXT:    movb $120, %cl
 ; AVX512F-ONLY-SLOW-NEXT:    kmovw %ecx, %k1
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm3 {%k1}
-; AVX512F-ONLY-SLOW-NEXT:    movb $48, %cl
-; AVX512F-ONLY-SLOW-NEXT:    kmovw %ecx, %k1
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm14 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
-; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm12[1],ymm7[1],ymm12[3],ymm7[3]
-; AVX512F-ONLY-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,3,3]
-; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm0 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm6[1],ymm9[3],ymm6[3]
+; AVX512F-ONLY-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,3,3]
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7]
 ; AVX512F-ONLY-SLOW-NEXT:    movb $14, %cl
 ; AVX512F-ONLY-SLOW-NEXT:    kmovw %ecx, %k1
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm14 {%k1}
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, 256(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, 64(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, 128(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, 320(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, (%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, 384(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm14, 192(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm10 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, 256(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, 64(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, 128(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, 320(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, (%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, 384(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm10, 192(%rax)
 ; AVX512F-ONLY-SLOW-NEXT:    vzeroupper
 ; AVX512F-ONLY-SLOW-NEXT:    retq
 ;
@@ -875,122 +874,121 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512F-ONLY-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512F-ONLY-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %r10
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%rsi), %zmm1
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%rdx), %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%rcx), %zmm4
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%r8), %zmm6
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%r9), %zmm7
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%r10), %zmm3
-; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm5 = [15,7,15,7,15,7,15,7]
-; AVX512F-ONLY-FAST-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm8 = [0,8,0,8,0,8,0,8]
-; AVX512F-ONLY-FAST-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm7, %zmm6, %zmm8
-; AVX512F-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm9 = [5,0,14,6,5,0,14,6]
-; AVX512F-ONLY-FAST-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm6, %zmm7, %zmm9
-; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm10 = [9,1,9,1,9,1,9,1]
-; AVX512F-ONLY-FAST-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm7, %zmm6, %zmm10
-; AVX512F-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm11 = [4,12,0,5,4,12,0,5]
-; AVX512F-ONLY-FAST-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm7, %zmm6, %zmm11
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm5, %zmm6
-; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm4, %zmm2, %zmm5
-; AVX512F-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm7 = [15,7,15,7]
-; AVX512F-ONLY-FAST-NEXT:    # ymm7 = mem[0,1,0,1]
-; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm1, %zmm0, %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%rsi), %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%rdx), %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%rcx), %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%r8), %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%r9), %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%r10), %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm6 = [15,7,15,7,15,7,15,7]
+; AVX512F-ONLY-FAST-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm8
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm6, %zmm8
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm9
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm6, %zmm9
 ; AVX512F-ONLY-FAST-NEXT:    movb $24, %sil
 ; AVX512F-ONLY-FAST-NEXT:    kmovw %esi, %k1
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, %zmm7 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, %zmm9 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm3, %zmm2, %zmm6
 ; AVX512F-ONLY-FAST-NEXT:    movb $96, %sil
 ; AVX512F-ONLY-FAST-NEXT:    kmovw %esi, %k1
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm6, %zmm7 {%k1}
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm5 = [14,1,2,3,4,5,6,15]
-; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm3, %zmm7, %zmm5
-; AVX512F-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm6 = [0,8,0,1,0,8,0,1]
-; AVX512F-ONLY-FAST-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm1, %zmm0, %zmm6
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa (%rdx), %xmm7
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm7 = xmm7[0],mem[0]
-; AVX512F-ONLY-FAST-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm6, %zmm9 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm6 = [14,1,2,3,4,5,6,15]
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm1, %zmm9, %zmm6
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [0,8,0,1,0,8,0,1]
+; AVX512F-ONLY-FAST-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm5, %zmm0, %zmm8
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa (%rdx), %xmm9
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm9 = xmm9[0],mem[0]
+; AVX512F-ONLY-FAST-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
 ; AVX512F-ONLY-FAST-NEXT:    movb $12, %sil
 ; AVX512F-ONLY-FAST-NEXT:    kmovw %esi, %k2
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm7, %zmm0, %zmm6 {%k2}
-; AVX512F-ONLY-FAST-NEXT:    vinserti32x4 $3, (%r10), %zmm8, %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm9, %zmm0, %zmm8 {%k2}
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm9 = [0,8,0,8,0,8,0,8]
+; AVX512F-ONLY-FAST-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm3, %zmm2, %zmm9
+; AVX512F-ONLY-FAST-NEXT:    vinserti32x4 $3, (%r10), %zmm9, %zmm9
 ; AVX512F-ONLY-FAST-NEXT:    movb $112, %sil
 ; AVX512F-ONLY-FAST-NEXT:    kmovw %esi, %k2
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, %zmm6 {%k2}
-; AVX512F-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [0,13,6,7,0,13,6,7]
-; AVX512F-ONLY-FAST-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm3, %zmm9, %zmm7
-; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm8 = [6,14,6,14,6,14,6,14]
-; AVX512F-ONLY-FAST-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm4, %zmm2, %zmm8
-; AVX512F-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm9 = [6,14,6,14]
-; AVX512F-ONLY-FAST-NEXT:    # ymm9 = mem[0,1,0,1]
-; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm1, %zmm0, %zmm9
-; AVX512F-ONLY-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm8 = zmm9[0,1,2,3],zmm8[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, %zmm8 {%k2}
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm9 = [6,14,6,14,6,14,6,14]
+; AVX512F-ONLY-FAST-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm10
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm9, %zmm10
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm5, %zmm0, %zmm9
+; AVX512F-ONLY-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm9 = zmm9[0,1,2,3],zmm10[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [5,0,14,6,5,0,14,6]
+; AVX512F-ONLY-FAST-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm2, %zmm3, %zmm10
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm11 = [0,13,6,7,0,13,6,7]
+; AVX512F-ONLY-FAST-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm1, %zmm10, %zmm11
 ; AVX512F-ONLY-FAST-NEXT:    movb $-61, %sil
 ; AVX512F-ONLY-FAST-NEXT:    kmovw %esi, %k2
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, %zmm8 {%k2}
-; AVX512F-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [3,0,12,4,3,0,12,4]
-; AVX512F-ONLY-FAST-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm2, %zmm4, %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm11, %zmm9 {%k2}
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [3,0,12,4,3,0,12,4]
+; AVX512F-ONLY-FAST-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm4, %zmm7, %zmm10
 ; AVX512F-ONLY-FAST-NEXT:    movb $48, %sil
 ; AVX512F-ONLY-FAST-NEXT:    kmovw %esi, %k2
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm7 {%k2} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa (%r9), %ymm9
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm10 {%k2} = zmm0[0],zmm5[0],zmm0[2],zmm5[2],zmm0[4],zmm5[4],zmm0[6],zmm5[6]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa (%r9), %ymm11
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa (%r8), %ymm12
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} ymm13 = <1,3,7,u>
-; AVX512F-ONLY-FAST-NEXT:    vpermi2q %ymm9, %ymm12, %ymm13
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %ymm11, %ymm12, %ymm13
 ; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],mem[6,7]
 ; AVX512F-ONLY-FAST-NEXT:    movb $14, %sil
 ; AVX512F-ONLY-FAST-NEXT:    kmovw %esi, %k2
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm13, %zmm0, %zmm7 {%k2}
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm13, %zmm0, %zmm10 {%k2}
 ; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm13 = [11,3,11,3,11,3,11,3]
 ; AVX512F-ONLY-FAST-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm1, %zmm0, %zmm13
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm5, %zmm0, %zmm13
 ; AVX512F-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm14 = [2,10,0,3,2,10,0,3]
 ; AVX512F-ONLY-FAST-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm4, %zmm2, %zmm14
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm7, %zmm4, %zmm14
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm14 {%k1}
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm9 = ymm12[0],ymm9[0],ymm12[2],ymm9[2]
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm11 = ymm12[0],ymm11[0],ymm12[2],ymm11[2]
 ; AVX512F-ONLY-FAST-NEXT:    movb $28, %sil
 ; AVX512F-ONLY-FAST-NEXT:    kmovw %esi, %k2
-; AVX512F-ONLY-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm9[2,3,2,3],zmm3[2,3,2,3]
-; AVX512F-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm9 = [1,0,10,2,1,0,10,2]
-; AVX512F-ONLY-FAST-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm0, %zmm1, %zmm9
+; AVX512F-ONLY-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm11[2,3,2,3],zmm1[2,3,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm11 = [1,0,10,2,1,0,10,2]
+; AVX512F-ONLY-FAST-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm0, %zmm5, %zmm11
 ; AVX512F-ONLY-FAST-NEXT:    vpbroadcastq 8(%rcx), %ymm12
 ; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm12 = mem[0,1,2,3],ymm12[4,5,6,7]
 ; AVX512F-ONLY-FAST-NEXT:    movb $6, %cl
 ; AVX512F-ONLY-FAST-NEXT:    kmovw %ecx, %k2
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm12, %zmm0, %zmm9 {%k2}
-; AVX512F-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm12 = [4,9,0,3,4,9,0,3]
-; AVX512F-ONLY-FAST-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm3, %zmm10, %zmm12
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm12, %zmm0, %zmm11 {%k2}
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm12 = [9,1,9,1,9,1,9,1]
+; AVX512F-ONLY-FAST-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm3, %zmm2, %zmm12
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm13 = [4,9,0,3,4,9,0,3]
+; AVX512F-ONLY-FAST-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm1, %zmm12, %zmm13
 ; AVX512F-ONLY-FAST-NEXT:    movb $56, %cl
 ; AVX512F-ONLY-FAST-NEXT:    kmovw %ecx, %k2
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm12, %zmm9 {%k2}
-; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm10 = [13,5,13,5,13,5,13,5]
-; AVX512F-ONLY-FAST-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm10, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm10, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm0 {%k1}
-; AVX512F-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,12,7,0,1,12,7]
-; AVX512F-ONLY-FAST-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm3, %zmm11, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm11 {%k2}
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm12 = [13,5,13,5,13,5,13,5]
+; AVX512F-ONLY-FAST-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm12, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm12, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm0 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm4 = [4,12,0,5,4,12,0,5]
+; AVX512F-ONLY-FAST-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm3, %zmm2, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [0,1,12,7,0,1,12,7]
+; AVX512F-ONLY-FAST-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm1, %zmm4, %zmm2
 ; AVX512F-ONLY-FAST-NEXT:    movb $120, %cl
 ; AVX512F-ONLY-FAST-NEXT:    kmovw %ecx, %k1
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, 256(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, 64(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, 256(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm11, 64(%rax)
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm14, 128(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, 192(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, 320(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm6, (%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, 384(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm10, 192(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, 320(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, (%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm6, 384(%rax)
 ; AVX512F-ONLY-FAST-NEXT:    vzeroupper
 ; AVX512F-ONLY-FAST-NEXT:    retq
 ;
@@ -998,122 +996,120 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-SLOW:       # %bb.0:
 ; AVX512DQ-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512DQ-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%rsi), %zmm1
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%rdx), %zmm4
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%rcx), %zmm5
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%r8), %zmm9
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%r9), %zmm10
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%r10), %zmm3
-; AVX512DQ-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [5,0,14,6,5,0,14,6]
-; AVX512DQ-SLOW-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm9, %zmm10, %zmm2
-; AVX512DQ-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm6 = [0,13,6,7,0,13,6,7]
-; AVX512DQ-SLOW-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm3, %zmm2, %zmm6
-; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [6,14,6,14,6,14,6,14]
-; AVX512DQ-SLOW-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm5, %zmm4, %zmm2
-; AVX512DQ-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm7 = [6,14,6,14]
-; AVX512DQ-SLOW-NEXT:    # ymm7 = mem[0,1,0,1]
-; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm1, %zmm0, %zmm7
-; AVX512DQ-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm7[0,1,2,3],zmm2[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm4
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%rsi), %zmm5
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%rdx), %zmm7
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%rcx), %zmm6
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%r8), %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%r9), %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%r10), %zmm1
+; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm8 = [6,14,6,14,6,14,6,14]
+; AVX512DQ-SLOW-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [2,10,0,3,2,10,0,3]
+; AVX512DQ-SLOW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm6, %zmm7, %zmm3
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm9
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm11
+; AVX512DQ-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [3,0,12,4,3,0,12,4]
+; AVX512DQ-SLOW-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm7, %zmm6, %zmm10
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm6, %zmm8, %zmm7
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm5, %zmm4, %zmm8
+; AVX512DQ-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm7 = zmm8[0,1,2,3],zmm7[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [5,0,14,6,5,0,14,6]
+; AVX512DQ-SLOW-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm0, %zmm2, %zmm8
+; AVX512DQ-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm12 = [0,13,6,7,0,13,6,7]
+; AVX512DQ-SLOW-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm1, %zmm8, %zmm12
 ; AVX512DQ-SLOW-NEXT:    movb $-61, %sil
 ; AVX512DQ-SLOW-NEXT:    kmovw %esi, %k1
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm2 {%k1}
-; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm7 = [11,3,11,3,11,3,11,3]
-; AVX512DQ-SLOW-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm1, %zmm0, %zmm7
-; AVX512DQ-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm6 = [2,10,0,3,2,10,0,3]
-; AVX512DQ-SLOW-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm5, %zmm4, %zmm6
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm7 {%k1}
+; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm8 = [11,3,11,3,11,3,11,3]
+; AVX512DQ-SLOW-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm5, %zmm4, %zmm8
 ; AVX512DQ-SLOW-NEXT:    movb $96, %sil
 ; AVX512DQ-SLOW-NEXT:    kmovw %esi, %k1
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm6 {%k1}
-; AVX512DQ-SLOW-NEXT:    vmovdqa (%r9), %ymm7
-; AVX512DQ-SLOW-NEXT:    vmovdqa (%r8), %ymm8
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm11 = ymm8[0],ymm7[0],ymm8[2],ymm7[2]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm3 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqa (%r9), %ymm8
+; AVX512DQ-SLOW-NEXT:    vmovdqa (%r8), %ymm12
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm8[0],ymm12[2],ymm8[2]
 ; AVX512DQ-SLOW-NEXT:    movb $28, %sil
 ; AVX512DQ-SLOW-NEXT:    kmovw %esi, %k2
-; AVX512DQ-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm6 {%k2} = zmm11[2,3,2,3],zmm3[2,3,2,3]
-; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm11 = [15,7,15,7,15,7,15,7]
-; AVX512DQ-SLOW-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm12 = [9,1,9,1,9,1,9,1]
-; AVX512DQ-SLOW-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm10, %zmm9, %zmm12
-; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm13 = [0,8,0,8,0,8,0,8]
+; AVX512DQ-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm13[2,3,2,3],zmm1[2,3,2,3]
+; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm13 = [15,7,15,7,15,7,15,7]
 ; AVX512DQ-SLOW-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm10, %zmm9, %zmm13
-; AVX512DQ-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm14 = [4,12,0,5,4,12,0,5]
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm6, %zmm13, %zmm9
+; AVX512DQ-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm14 = [1,0,10,2,1,0,10,2]
 ; AVX512DQ-SLOW-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm10, %zmm9, %zmm14
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm10, %zmm11, %zmm9
-; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm5, %zmm4, %zmm11
-; AVX512DQ-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm10 = [15,7,15,7]
-; AVX512DQ-SLOW-NEXT:    # ymm10 = mem[0,1,0,1]
-; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm1, %zmm0, %zmm10
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm4, %zmm5, %zmm14
+; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm15 = [13,5,13,5,13,5,13,5]
+; AVX512DQ-SLOW-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm6, %zmm15, %zmm11
+; AVX512DQ-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm6 = [0,8,0,1,0,8,0,1]
+; AVX512DQ-SLOW-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm5, %zmm4, %zmm6
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm5, %zmm4, %zmm15
+; AVX512DQ-SLOW-NEXT:    movb $48, %sil
+; AVX512DQ-SLOW-NEXT:    kmovw %esi, %k2
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm10 {%k2} = zmm4[0],zmm5[0],zmm4[2],zmm5[2],zmm4[4],zmm5[4],zmm4[6],zmm5[6]
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm5, %zmm13, %zmm4
 ; AVX512DQ-SLOW-NEXT:    movb $24, %sil
 ; AVX512DQ-SLOW-NEXT:    kmovw %esi, %k2
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm10 {%k2}
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm10 {%k1}
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm9 = [14,1,2,3,4,5,6,15]
-; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm3, %zmm10, %zmm9
-; AVX512DQ-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [1,0,10,2,1,0,10,2]
-; AVX512DQ-SLOW-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm0, %zmm1, %zmm10
-; AVX512DQ-SLOW-NEXT:    vpbroadcastq 8(%rcx), %ymm11
-; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm11 = mem[0,1,2,3],ymm11[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm4 {%k2}
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm2, %zmm0, %zmm13
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm4 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm5 = [14,1,2,3,4,5,6,15]
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm1, %zmm4, %zmm5
+; AVX512DQ-SLOW-NEXT:    vpbroadcastq 8(%rcx), %ymm4
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm4 = mem[0,1,2,3],ymm4[4,5,6,7]
 ; AVX512DQ-SLOW-NEXT:    movb $6, %sil
 ; AVX512DQ-SLOW-NEXT:    kmovw %esi, %k2
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm11, %zmm0, %zmm10 {%k2}
-; AVX512DQ-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm11 = [4,9,0,3,4,9,0,3]
-; AVX512DQ-SLOW-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm3, %zmm12, %zmm11
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm0, %zmm14 {%k2}
+; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm4 = [9,1,9,1,9,1,9,1]
+; AVX512DQ-SLOW-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm2, %zmm0, %zmm4
+; AVX512DQ-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm9 = [4,9,0,3,4,9,0,3]
+; AVX512DQ-SLOW-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm1, %zmm4, %zmm9
 ; AVX512DQ-SLOW-NEXT:    movb $56, %sil
 ; AVX512DQ-SLOW-NEXT:    kmovw %esi, %k2
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm10 {%k2}
-; AVX512DQ-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm11 = [0,8,0,1,0,8,0,1]
-; AVX512DQ-SLOW-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm1, %zmm0, %zmm11
-; AVX512DQ-SLOW-NEXT:    vmovdqa (%rdx), %xmm12
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm12 = xmm12[0],mem[0]
-; AVX512DQ-SLOW-NEXT:    vinserti128 $1, %xmm12, %ymm0, %ymm12
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm14 {%k2}
+; AVX512DQ-SLOW-NEXT:    vmovdqa (%rdx), %xmm4
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],mem[0]
+; AVX512DQ-SLOW-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
 ; AVX512DQ-SLOW-NEXT:    movb $12, %cl
 ; AVX512DQ-SLOW-NEXT:    kmovw %ecx, %k2
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm12, %zmm0, %zmm11 {%k2}
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm0, %zmm6 {%k2}
+; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm4 = [0,8,0,8,0,8,0,8]
+; AVX512DQ-SLOW-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm2, %zmm0, %zmm4
 ; AVX512DQ-SLOW-NEXT:    movb $112, %cl
 ; AVX512DQ-SLOW-NEXT:    kmovw %ecx, %k2
-; AVX512DQ-SLOW-NEXT:    vinserti64x2 $3, (%r10), %zmm13, %zmm11 {%k2}
-; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm12 = [13,5,13,5,13,5,13,5]
-; AVX512DQ-SLOW-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm13 = [3,0,12,4,3,0,12,4]
-; AVX512DQ-SLOW-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm4, %zmm5, %zmm13
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm5, %zmm12, %zmm4
-; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm1, %zmm0, %zmm12
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm12 {%k1}
-; AVX512DQ-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm4 = [0,1,12,7,0,1,12,7]
+; AVX512DQ-SLOW-NEXT:    vinserti64x2 $3, (%r10), %zmm4, %zmm6 {%k2}
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm15 {%k1}
+; AVX512DQ-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm4 = [4,12,0,5,4,12,0,5]
 ; AVX512DQ-SLOW-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm3, %zmm14, %zmm4
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm2, %zmm0, %zmm4
+; AVX512DQ-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [0,1,12,7,0,1,12,7]
+; AVX512DQ-SLOW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm1, %zmm4, %zmm0
 ; AVX512DQ-SLOW-NEXT:    movb $120, %cl
 ; AVX512DQ-SLOW-NEXT:    kmovw %ecx, %k1
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm4 {%k1}
-; AVX512DQ-SLOW-NEXT:    movb $48, %cl
-; AVX512DQ-SLOW-NEXT:    kmovw %ecx, %k1
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm13 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
-; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm8[1],ymm7[1],ymm8[3],ymm7[3]
-; AVX512DQ-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,3,3]
-; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm15, %zmm0 {%k1}
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm12[1],ymm8[1],ymm12[3],ymm8[3]
+; AVX512DQ-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,3,3]
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7]
 ; AVX512DQ-SLOW-NEXT:    movb $14, %cl
 ; AVX512DQ-SLOW-NEXT:    kmovw %ecx, %k1
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm13 {%k1}
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm4, 256(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm11, (%rax)
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm10, 64(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm9, 384(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm6, 128(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm13, 192(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm2, 320(%rax)
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm10 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, 256(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm6, (%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm14, 64(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm5, 384(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, 128(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm10, 192(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm7, 320(%rax)
 ; AVX512DQ-SLOW-NEXT:    vzeroupper
 ; AVX512DQ-SLOW-NEXT:    retq
 ;
@@ -1122,121 +1118,120 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512DQ-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %r10
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512DQ-FAST-NEXT:    vmovdqa64 (%rsi), %zmm1
-; AVX512DQ-FAST-NEXT:    vmovdqa64 (%rdx), %zmm2
-; AVX512DQ-FAST-NEXT:    vmovdqa64 (%rcx), %zmm4
-; AVX512DQ-FAST-NEXT:    vmovdqa64 (%r8), %zmm8
-; AVX512DQ-FAST-NEXT:    vmovdqa64 (%r9), %zmm9
-; AVX512DQ-FAST-NEXT:    vmovdqa64 (%r10), %zmm3
-; AVX512DQ-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm5 = [5,0,14,6,5,0,14,6]
-; AVX512DQ-FAST-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FAST-NEXT:    vpermi2q %zmm8, %zmm9, %zmm5
-; AVX512DQ-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm6 = [0,13,6,7,0,13,6,7]
-; AVX512DQ-FAST-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FAST-NEXT:    vpermi2q %zmm3, %zmm5, %zmm6
-; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm5 = [6,14,6,14,6,14,6,14]
-; AVX512DQ-FAST-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-FAST-NEXT:    vpermi2q %zmm4, %zmm2, %zmm5
-; AVX512DQ-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm7 = [6,14,6,14]
-; AVX512DQ-FAST-NEXT:    # ymm7 = mem[0,1,0,1]
-; AVX512DQ-FAST-NEXT:    vpermi2q %zmm1, %zmm0, %zmm7
-; AVX512DQ-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm5 = zmm7[0,1,2,3],zmm5[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 (%rsi), %zmm6
+; AVX512DQ-FAST-NEXT:    vmovdqa64 (%rdx), %zmm5
+; AVX512DQ-FAST-NEXT:    vmovdqa64 (%rcx), %zmm7
+; AVX512DQ-FAST-NEXT:    vmovdqa64 (%r8), %zmm3
+; AVX512DQ-FAST-NEXT:    vmovdqa64 (%r9), %zmm4
+; AVX512DQ-FAST-NEXT:    vmovdqa64 (%r10), %zmm2
+; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [6,14,6,14,6,14,6,14]
+; AVX512DQ-FAST-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm5, %zmm8
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm7, %zmm1, %zmm8
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm6, %zmm0, %zmm1
+; AVX512DQ-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm8[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [5,0,14,6,5,0,14,6]
+; AVX512DQ-FAST-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm3, %zmm4, %zmm8
+; AVX512DQ-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm9 = [0,13,6,7,0,13,6,7]
+; AVX512DQ-FAST-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm2, %zmm8, %zmm9
 ; AVX512DQ-FAST-NEXT:    movb $-61, %sil
 ; AVX512DQ-FAST-NEXT:    kmovw %esi, %k1
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm6, %zmm5 {%k1}
-; AVX512DQ-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm6 = [3,0,12,4,3,0,12,4]
-; AVX512DQ-FAST-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FAST-NEXT:    vpermi2q %zmm2, %zmm4, %zmm6
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm9, %zmm1 {%k1}
+; AVX512DQ-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [3,0,12,4,3,0,12,4]
+; AVX512DQ-FAST-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm5, %zmm7, %zmm8
 ; AVX512DQ-FAST-NEXT:    movb $48, %sil
 ; AVX512DQ-FAST-NEXT:    kmovw %esi, %k1
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm0[0],zmm6[0],zmm0[2],zmm6[2],zmm0[4],zmm6[4],zmm0[6],zmm6[6]
 ; AVX512DQ-FAST-NEXT:    vmovdqa (%r9), %ymm10
 ; AVX512DQ-FAST-NEXT:    vmovdqa (%r8), %ymm11
-; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} ymm7 = <1,3,7,u>
-; AVX512DQ-FAST-NEXT:    vpermi2q %ymm10, %ymm11, %ymm7
-; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],mem[6,7]
+; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} ymm9 = <1,3,7,u>
+; AVX512DQ-FAST-NEXT:    vpermi2q %ymm10, %ymm11, %ymm9
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7]
 ; AVX512DQ-FAST-NEXT:    movb $14, %sil
 ; AVX512DQ-FAST-NEXT:    kmovw %esi, %k1
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm7, %zmm0, %zmm6 {%k1}
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm9, %zmm0, %zmm8 {%k1}
 ; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm12 = [11,3,11,3,11,3,11,3]
 ; AVX512DQ-FAST-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-FAST-NEXT:    vpermi2q %zmm1, %zmm0, %zmm12
-; AVX512DQ-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [2,10,0,3,2,10,0,3]
-; AVX512DQ-FAST-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FAST-NEXT:    vpermi2q %zmm4, %zmm2, %zmm7
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm6, %zmm0, %zmm12
+; AVX512DQ-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm9 = [2,10,0,3,2,10,0,3]
+; AVX512DQ-FAST-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm7, %zmm5, %zmm9
 ; AVX512DQ-FAST-NEXT:    movb $96, %sil
 ; AVX512DQ-FAST-NEXT:    kmovw %esi, %k1
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm12, %zmm7 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm12, %zmm9 {%k1}
 ; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm10 = ymm11[0],ymm10[0],ymm11[2],ymm10[2]
 ; AVX512DQ-FAST-NEXT:    movb $28, %sil
 ; AVX512DQ-FAST-NEXT:    kmovw %esi, %k2
-; AVX512DQ-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm7 {%k2} = zmm10[2,3,2,3],zmm3[2,3,2,3]
+; AVX512DQ-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm9 {%k2} = zmm10[2,3,2,3],zmm2[2,3,2,3]
 ; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm10 = [15,7,15,7,15,7,15,7]
 ; AVX512DQ-FAST-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm11 = [9,1,9,1,9,1,9,1]
-; AVX512DQ-FAST-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-FAST-NEXT:    vpermi2q %zmm9, %zmm8, %zmm11
-; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm12 = [0,8,0,8,0,8,0,8]
-; AVX512DQ-FAST-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-FAST-NEXT:    vpermi2q %zmm9, %zmm8, %zmm12
-; AVX512DQ-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm13 = [4,12,0,5,4,12,0,5]
-; AVX512DQ-FAST-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FAST-NEXT:    vpermi2q %zmm9, %zmm8, %zmm13
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm9, %zmm10, %zmm8
-; AVX512DQ-FAST-NEXT:    vpermi2q %zmm4, %zmm2, %zmm10
-; AVX512DQ-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm9 = [15,7,15,7]
-; AVX512DQ-FAST-NEXT:    # ymm9 = mem[0,1,0,1]
-; AVX512DQ-FAST-NEXT:    vpermi2q %zmm1, %zmm0, %zmm9
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm5, %zmm11
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm7, %zmm10, %zmm11
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm12
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm6, %zmm10, %zmm12
 ; AVX512DQ-FAST-NEXT:    movb $24, %sil
 ; AVX512DQ-FAST-NEXT:    kmovw %esi, %k2
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm10, %zmm9 {%k2}
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm8, %zmm9 {%k1}
-; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm8 = [14,1,2,3,4,5,6,15]
-; AVX512DQ-FAST-NEXT:    vpermi2q %zmm3, %zmm9, %zmm8
-; AVX512DQ-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm9 = [1,0,10,2,1,0,10,2]
-; AVX512DQ-FAST-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FAST-NEXT:    vpermi2q %zmm0, %zmm1, %zmm9
-; AVX512DQ-FAST-NEXT:    vpbroadcastq 8(%rcx), %ymm10
-; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm10 = mem[0,1,2,3],ymm10[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm11, %zmm12 {%k2}
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm4, %zmm3, %zmm10
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm10, %zmm12 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm10 = [14,1,2,3,4,5,6,15]
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm2, %zmm12, %zmm10
+; AVX512DQ-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm11 = [1,0,10,2,1,0,10,2]
+; AVX512DQ-FAST-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm0, %zmm6, %zmm11
+; AVX512DQ-FAST-NEXT:    vpbroadcastq 8(%rcx), %ymm12
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm12 = mem[0,1,2,3],ymm12[4,5,6,7]
 ; AVX512DQ-FAST-NEXT:    movb $6, %sil
 ; AVX512DQ-FAST-NEXT:    kmovw %esi, %k2
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm10, %zmm0, %zmm9 {%k2}
-; AVX512DQ-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [4,9,0,3,4,9,0,3]
-; AVX512DQ-FAST-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FAST-NEXT:    vpermi2q %zmm3, %zmm11, %zmm10
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm12, %zmm0, %zmm11 {%k2}
+; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm12 = [9,1,9,1,9,1,9,1]
+; AVX512DQ-FAST-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm4, %zmm3, %zmm12
+; AVX512DQ-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm13 = [4,9,0,3,4,9,0,3]
+; AVX512DQ-FAST-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm2, %zmm12, %zmm13
 ; AVX512DQ-FAST-NEXT:    movb $56, %sil
 ; AVX512DQ-FAST-NEXT:    kmovw %esi, %k2
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm10, %zmm9 {%k2}
-; AVX512DQ-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [0,8,0,1,0,8,0,1]
-; AVX512DQ-FAST-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FAST-NEXT:    vpermi2q %zmm1, %zmm0, %zmm10
-; AVX512DQ-FAST-NEXT:    vmovdqa (%rdx), %xmm11
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm11 = xmm11[0],mem[0]
-; AVX512DQ-FAST-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm13, %zmm11 {%k2}
+; AVX512DQ-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm12 = [0,8,0,1,0,8,0,1]
+; AVX512DQ-FAST-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm6, %zmm0, %zmm12
+; AVX512DQ-FAST-NEXT:    vmovdqa (%rdx), %xmm13
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm13 = xmm13[0],mem[0]
+; AVX512DQ-FAST-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
 ; AVX512DQ-FAST-NEXT:    movb $12, %cl
 ; AVX512DQ-FAST-NEXT:    kmovw %ecx, %k2
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm11, %zmm0, %zmm10 {%k2}
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm13, %zmm0, %zmm12 {%k2}
+; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm13 = [0,8,0,8,0,8,0,8]
+; AVX512DQ-FAST-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm4, %zmm3, %zmm13
 ; AVX512DQ-FAST-NEXT:    movb $112, %cl
 ; AVX512DQ-FAST-NEXT:    kmovw %ecx, %k2
-; AVX512DQ-FAST-NEXT:    vinserti64x2 $3, (%r10), %zmm12, %zmm10 {%k2}
-; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm11 = [13,5,13,5,13,5,13,5]
-; AVX512DQ-FAST-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm4, %zmm11, %zmm2
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm1, %zmm11, %zmm0
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, %zmm0 {%k1}
-; AVX512DQ-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,12,7,0,1,12,7]
-; AVX512DQ-FAST-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FAST-NEXT:    vpermi2q %zmm3, %zmm13, %zmm1
+; AVX512DQ-FAST-NEXT:    vinserti64x2 $3, (%r10), %zmm13, %zmm12 {%k2}
+; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm13 = [13,5,13,5,13,5,13,5]
+; AVX512DQ-FAST-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm7, %zmm13, %zmm5
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm6, %zmm13, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm5, %zmm0 {%k1}
+; AVX512DQ-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm5 = [4,12,0,5,4,12,0,5]
+; AVX512DQ-FAST-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm4, %zmm3, %zmm5
+; AVX512DQ-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [0,1,12,7,0,1,12,7]
+; AVX512DQ-FAST-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm2, %zmm5, %zmm3
 ; AVX512DQ-FAST-NEXT:    movb $120, %cl
 ; AVX512DQ-FAST-NEXT:    kmovw %ecx, %k1
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm1, 256(%rax)
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm10, (%rax)
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm9, 64(%rax)
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm8, 384(%rax)
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm7, 128(%rax)
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm6, 192(%rax)
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm5, 320(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm3 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, 256(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm12, (%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm11, 64(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm10, 384(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm9, 128(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm8, 192(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm1, 320(%rax)
 ; AVX512DQ-FAST-NEXT:    vzeroupper
 ; AVX512DQ-FAST-NEXT:    retq
 ;
@@ -1244,123 +1239,122 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512BW-ONLY-SLOW:       # %bb.0:
 ; AVX512BW-ONLY-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512BW-ONLY-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%rsi), %zmm1
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%rdx), %zmm3
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%rcx), %zmm4
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%r8), %zmm6
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%r9), %zmm7
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%r10), %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm5 = [15,7,15,7,15,7,15,7]
-; AVX512BW-ONLY-SLOW-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm8 = [0,8,0,8,0,8,0,8]
-; AVX512BW-ONLY-SLOW-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm7, %zmm6, %zmm8
-; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm9 = [5,0,14,6,5,0,14,6]
-; AVX512BW-ONLY-SLOW-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm6, %zmm7, %zmm9
-; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm11 = [9,1,9,1,9,1,9,1]
-; AVX512BW-ONLY-SLOW-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm7, %zmm6, %zmm11
-; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [4,12,0,5,4,12,0,5]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%rsi), %zmm6
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%rdx), %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%rcx), %zmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%r8), %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%r9), %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%r10), %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [15,7,15,7,15,7,15,7]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm9
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm4 = [2,10,0,3,2,10,0,3]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm7, %zmm8, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm11
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [3,0,12,4,3,0,12,4]
 ; AVX512BW-ONLY-SLOW-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm7, %zmm6, %zmm10
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm5, %zmm6
-; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm4, %zmm3, %zmm5
-; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm7 = [15,7,15,7]
-; AVX512BW-ONLY-SLOW-NEXT:    # ymm7 = mem[0,1,0,1]
-; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm1, %zmm0, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm8, %zmm7, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm3, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [0,8,0,1,0,8,0,1]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm6, %zmm5, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,6,14,6,14,6,14]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm14, %zmm9
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm6, %zmm5, %zmm14
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm15 = [11,3,11,3,11,3,11,3]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm6, %zmm5, %zmm15
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm12 = [13,5,13,5,13,5,13,5]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm12, %zmm11
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [1,0,10,2,1,0,10,2]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm5, %zmm6, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm6, %zmm5, %zmm12
+; AVX512BW-ONLY-SLOW-NEXT:    movb $48, %sil
+; AVX512BW-ONLY-SLOW-NEXT:    kmovd %esi, %k1
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm10 {%k1} = zmm5[0],zmm6[0],zmm5[2],zmm6[2],zmm5[4],zmm6[4],zmm5[6],zmm6[6]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm6, %zmm3, %zmm5
 ; AVX512BW-ONLY-SLOW-NEXT:    movb $24, %sil
 ; AVX512BW-ONLY-SLOW-NEXT:    kmovd %esi, %k1
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm7 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm5 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm2, %zmm0, %zmm3
 ; AVX512BW-ONLY-SLOW-NEXT:    movb $96, %sil
 ; AVX512BW-ONLY-SLOW-NEXT:    kmovd %esi, %k1
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm7 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm5 = [14,1,2,3,4,5,6,15]
-; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm2, %zmm7, %zmm5
-; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm6 = [0,8,0,1,0,8,0,1]
-; AVX512BW-ONLY-SLOW-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm1, %zmm0, %zmm6
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa (%rdx), %xmm7
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm7 = xmm7[0],mem[0]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm5 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [14,1,2,3,4,5,6,15]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm1, %zmm5, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa (%rdx), %xmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
 ; AVX512BW-ONLY-SLOW-NEXT:    movb $12, %sil
 ; AVX512BW-ONLY-SLOW-NEXT:    kmovd %esi, %k2
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm7, %zmm0, %zmm6 {%k2}
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti32x4 $3, (%r10), %zmm8, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm5, %zmm0, %zmm8 {%k2}
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm5 = [0,8,0,8,0,8,0,8]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm2, %zmm0, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti32x4 $3, (%r10), %zmm5, %zmm5
 ; AVX512BW-ONLY-SLOW-NEXT:    movb $112, %sil
 ; AVX512BW-ONLY-SLOW-NEXT:    kmovd %esi, %k2
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm6 {%k2}
-; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [0,13,6,7,0,13,6,7]
-; AVX512BW-ONLY-SLOW-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm2, %zmm9, %zmm7
-; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm8 = [6,14,6,14,6,14,6,14]
-; AVX512BW-ONLY-SLOW-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm4, %zmm3, %zmm8
-; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm9 = [6,14,6,14]
-; AVX512BW-ONLY-SLOW-NEXT:    # ymm9 = mem[0,1,0,1]
-; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm1, %zmm0, %zmm9
-; AVX512BW-ONLY-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm8 = zmm9[0,1,2,3],zmm8[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm8 {%k2}
+; AVX512BW-ONLY-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm5 = zmm14[0,1,2,3],zmm9[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm6 = [5,0,14,6,5,0,14,6]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm0, %zmm2, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm9 = [0,13,6,7,0,13,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm1, %zmm6, %zmm9
 ; AVX512BW-ONLY-SLOW-NEXT:    movb $-61, %sil
 ; AVX512BW-ONLY-SLOW-NEXT:    kmovd %esi, %k2
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm8 {%k2}
-; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm7 = [11,3,11,3,11,3,11,3]
-; AVX512BW-ONLY-SLOW-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm1, %zmm0, %zmm7
-; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm9 = [2,10,0,3,2,10,0,3]
-; AVX512BW-ONLY-SLOW-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm4, %zmm3, %zmm9
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm9 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa (%r9), %ymm7
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa (%r8), %ymm12
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm7[0],ymm12[2],ymm7[2]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm5 {%k2}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm15, %zmm4 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa (%r9), %ymm6
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa (%r8), %ymm9
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm13 = ymm9[0],ymm6[0],ymm9[2],ymm6[2]
 ; AVX512BW-ONLY-SLOW-NEXT:    movb $28, %sil
 ; AVX512BW-ONLY-SLOW-NEXT:    kmovd %esi, %k2
-; AVX512BW-ONLY-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm9 {%k2} = zmm13[2,3,2,3],zmm2[2,3,2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm13 = [1,0,10,2,1,0,10,2]
-; AVX512BW-ONLY-SLOW-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm0, %zmm1, %zmm13
-; AVX512BW-ONLY-SLOW-NEXT:    vpbroadcastq 8(%rcx), %ymm14
-; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm14 = mem[0,1,2,3],ymm14[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm13[2,3,2,3],zmm1[2,3,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vpbroadcastq 8(%rcx), %ymm13
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm13 = mem[0,1,2,3],ymm13[4,5,6,7]
 ; AVX512BW-ONLY-SLOW-NEXT:    movb $6, %cl
 ; AVX512BW-ONLY-SLOW-NEXT:    kmovd %ecx, %k2
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm14, %zmm0, %zmm13 {%k2}
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm13, %zmm0, %zmm7 {%k2}
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm13 = [9,1,9,1,9,1,9,1]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm2, %zmm0, %zmm13
 ; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm14 = [4,9,0,3,4,9,0,3]
 ; AVX512BW-ONLY-SLOW-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm2, %zmm11, %zmm14
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm1, %zmm13, %zmm14
 ; AVX512BW-ONLY-SLOW-NEXT:    movb $56, %cl
 ; AVX512BW-ONLY-SLOW-NEXT:    kmovd %ecx, %k2
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm14, %zmm13 {%k2}
-; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm11 = [13,5,13,5,13,5,13,5]
-; AVX512BW-ONLY-SLOW-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm14 = [3,0,12,4,3,0,12,4]
-; AVX512BW-ONLY-SLOW-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm3, %zmm4, %zmm14
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm11, %zmm3
-; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm1, %zmm0, %zmm11
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm11 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [0,1,12,7,0,1,12,7]
-; AVX512BW-ONLY-SLOW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm2, %zmm10, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm14, %zmm7 {%k2}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm12 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm11 = [4,12,0,5,4,12,0,5]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm2, %zmm0, %zmm11
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [0,1,12,7,0,1,12,7]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm1, %zmm11, %zmm0
 ; AVX512BW-ONLY-SLOW-NEXT:    movb $120, %cl
 ; AVX512BW-ONLY-SLOW-NEXT:    kmovd %ecx, %k1
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm3 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT:    movb $48, %cl
-; AVX512BW-ONLY-SLOW-NEXT:    kmovd %ecx, %k1
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm14 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm12[1],ymm7[1],ymm12[3],ymm7[3]
-; AVX512BW-ONLY-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,3,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm0 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm6[1],ymm9[3],ymm6[3]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,3,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7]
 ; AVX512BW-ONLY-SLOW-NEXT:    movb $14, %cl
 ; AVX512BW-ONLY-SLOW-NEXT:    kmovd %ecx, %k1
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm14 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, 256(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, 64(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, 128(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, 320(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, (%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, 384(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm14, 192(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm10 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, 256(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, 64(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, 128(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, 320(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, (%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, 384(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm10, 192(%rax)
 ; AVX512BW-ONLY-SLOW-NEXT:    vzeroupper
 ; AVX512BW-ONLY-SLOW-NEXT:    retq
 ;
@@ -1369,122 +1363,121 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512BW-ONLY-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512BW-ONLY-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %r10
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%rsi), %zmm1
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%rdx), %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%rcx), %zmm4
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%r8), %zmm6
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%r9), %zmm7
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%r10), %zmm3
-; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm5 = [15,7,15,7,15,7,15,7]
-; AVX512BW-ONLY-FAST-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm8 = [0,8,0,8,0,8,0,8]
-; AVX512BW-ONLY-FAST-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm7, %zmm6, %zmm8
-; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm9 = [5,0,14,6,5,0,14,6]
-; AVX512BW-ONLY-FAST-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm6, %zmm7, %zmm9
-; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm10 = [9,1,9,1,9,1,9,1]
-; AVX512BW-ONLY-FAST-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm7, %zmm6, %zmm10
-; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm11 = [4,12,0,5,4,12,0,5]
-; AVX512BW-ONLY-FAST-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm7, %zmm6, %zmm11
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm5, %zmm6
-; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm4, %zmm2, %zmm5
-; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm7 = [15,7,15,7]
-; AVX512BW-ONLY-FAST-NEXT:    # ymm7 = mem[0,1,0,1]
-; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm1, %zmm0, %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%rsi), %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%rdx), %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%rcx), %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%r8), %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%r9), %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%r10), %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm6 = [15,7,15,7,15,7,15,7]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm8
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm6, %zmm8
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm9
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm6, %zmm9
 ; AVX512BW-ONLY-FAST-NEXT:    movb $24, %sil
 ; AVX512BW-ONLY-FAST-NEXT:    kmovd %esi, %k1
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, %zmm7 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, %zmm9 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm3, %zmm2, %zmm6
 ; AVX512BW-ONLY-FAST-NEXT:    movb $96, %sil
 ; AVX512BW-ONLY-FAST-NEXT:    kmovd %esi, %k1
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm6, %zmm7 {%k1}
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm5 = [14,1,2,3,4,5,6,15]
-; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm3, %zmm7, %zmm5
-; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm6 = [0,8,0,1,0,8,0,1]
-; AVX512BW-ONLY-FAST-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm1, %zmm0, %zmm6
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa (%rdx), %xmm7
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm7 = xmm7[0],mem[0]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm6, %zmm9 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm6 = [14,1,2,3,4,5,6,15]
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm1, %zmm9, %zmm6
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [0,8,0,1,0,8,0,1]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm5, %zmm0, %zmm8
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa (%rdx), %xmm9
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm9 = xmm9[0],mem[0]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
 ; AVX512BW-ONLY-FAST-NEXT:    movb $12, %sil
 ; AVX512BW-ONLY-FAST-NEXT:    kmovd %esi, %k2
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm7, %zmm0, %zmm6 {%k2}
-; AVX512BW-ONLY-FAST-NEXT:    vinserti32x4 $3, (%r10), %zmm8, %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm9, %zmm0, %zmm8 {%k2}
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm9 = [0,8,0,8,0,8,0,8]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm3, %zmm2, %zmm9
+; AVX512BW-ONLY-FAST-NEXT:    vinserti32x4 $3, (%r10), %zmm9, %zmm9
 ; AVX512BW-ONLY-FAST-NEXT:    movb $112, %sil
 ; AVX512BW-ONLY-FAST-NEXT:    kmovd %esi, %k2
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, %zmm6 {%k2}
-; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [0,13,6,7,0,13,6,7]
-; AVX512BW-ONLY-FAST-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm3, %zmm9, %zmm7
-; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm8 = [6,14,6,14,6,14,6,14]
-; AVX512BW-ONLY-FAST-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm4, %zmm2, %zmm8
-; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm9 = [6,14,6,14]
-; AVX512BW-ONLY-FAST-NEXT:    # ymm9 = mem[0,1,0,1]
-; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm1, %zmm0, %zmm9
-; AVX512BW-ONLY-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm8 = zmm9[0,1,2,3],zmm8[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, %zmm8 {%k2}
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm9 = [6,14,6,14,6,14,6,14]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm10
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm9, %zmm10
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm5, %zmm0, %zmm9
+; AVX512BW-ONLY-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm9 = zmm9[0,1,2,3],zmm10[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [5,0,14,6,5,0,14,6]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm2, %zmm3, %zmm10
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm11 = [0,13,6,7,0,13,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm1, %zmm10, %zmm11
 ; AVX512BW-ONLY-FAST-NEXT:    movb $-61, %sil
 ; AVX512BW-ONLY-FAST-NEXT:    kmovd %esi, %k2
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, %zmm8 {%k2}
-; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [3,0,12,4,3,0,12,4]
-; AVX512BW-ONLY-FAST-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm2, %zmm4, %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm11, %zmm9 {%k2}
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [3,0,12,4,3,0,12,4]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm4, %zmm7, %zmm10
 ; AVX512BW-ONLY-FAST-NEXT:    movb $48, %sil
 ; AVX512BW-ONLY-FAST-NEXT:    kmovd %esi, %k2
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm7 {%k2} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa (%r9), %ymm9
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm10 {%k2} = zmm0[0],zmm5[0],zmm0[2],zmm5[2],zmm0[4],zmm5[4],zmm0[6],zmm5[6]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa (%r9), %ymm11
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa (%r8), %ymm12
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} ymm13 = <1,3,7,u>
-; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %ymm9, %ymm12, %ymm13
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %ymm11, %ymm12, %ymm13
 ; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],mem[6,7]
 ; AVX512BW-ONLY-FAST-NEXT:    movb $14, %sil
 ; AVX512BW-ONLY-FAST-NEXT:    kmovd %esi, %k2
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm13, %zmm0, %zmm7 {%k2}
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm13, %zmm0, %zmm10 {%k2}
 ; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm13 = [11,3,11,3,11,3,11,3]
 ; AVX512BW-ONLY-FAST-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm1, %zmm0, %zmm13
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm5, %zmm0, %zmm13
 ; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm14 = [2,10,0,3,2,10,0,3]
 ; AVX512BW-ONLY-FAST-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm4, %zmm2, %zmm14
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm7, %zmm4, %zmm14
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm14 {%k1}
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm9 = ymm12[0],ymm9[0],ymm12[2],ymm9[2]
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm11 = ymm12[0],ymm11[0],ymm12[2],ymm11[2]
 ; AVX512BW-ONLY-FAST-NEXT:    movb $28, %sil
 ; AVX512BW-ONLY-FAST-NEXT:    kmovd %esi, %k2
-; AVX512BW-ONLY-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm9[2,3,2,3],zmm3[2,3,2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm9 = [1,0,10,2,1,0,10,2]
-; AVX512BW-ONLY-FAST-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm0, %zmm1, %zmm9
+; AVX512BW-ONLY-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm11[2,3,2,3],zmm1[2,3,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm11 = [1,0,10,2,1,0,10,2]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm0, %zmm5, %zmm11
 ; AVX512BW-ONLY-FAST-NEXT:    vpbroadcastq 8(%rcx), %ymm12
 ; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm12 = mem[0,1,2,3],ymm12[4,5,6,7]
 ; AVX512BW-ONLY-FAST-NEXT:    movb $6, %cl
 ; AVX512BW-ONLY-FAST-NEXT:    kmovd %ecx, %k2
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm12, %zmm0, %zmm9 {%k2}
-; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm12 = [4,9,0,3,4,9,0,3]
-; AVX512BW-ONLY-FAST-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm3, %zmm10, %zmm12
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm12, %zmm0, %zmm11 {%k2}
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm12 = [9,1,9,1,9,1,9,1]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm3, %zmm2, %zmm12
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm13 = [4,9,0,3,4,9,0,3]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm1, %zmm12, %zmm13
 ; AVX512BW-ONLY-FAST-NEXT:    movb $56, %cl
 ; AVX512BW-ONLY-FAST-NEXT:    kmovd %ecx, %k2
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm12, %zmm9 {%k2}
-; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm10 = [13,5,13,5,13,5,13,5]
-; AVX512BW-ONLY-FAST-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm10, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm10, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm0 {%k1}
-; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,12,7,0,1,12,7]
-; AVX512BW-ONLY-FAST-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm3, %zmm11, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm11 {%k2}
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm12 = [13,5,13,5,13,5,13,5]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm12, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm12, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm0 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm4 = [4,12,0,5,4,12,0,5]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm3, %zmm2, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [0,1,12,7,0,1,12,7]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm1, %zmm4, %zmm2
 ; AVX512BW-ONLY-FAST-NEXT:    movb $120, %cl
 ; AVX512BW-ONLY-FAST-NEXT:    kmovd %ecx, %k1
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, 256(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, 64(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, 256(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm11, 64(%rax)
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm14, 128(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, 192(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, 320(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm6, (%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, 384(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm10, 192(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, 320(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, (%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm6, 384(%rax)
 ; AVX512BW-ONLY-FAST-NEXT:    vzeroupper
 ; AVX512BW-ONLY-FAST-NEXT:    retq
 ;
@@ -1492,122 +1485,120 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQBW-SLOW:       # %bb.0:
 ; AVX512DQBW-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512DQBW-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%rsi), %zmm1
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%rdx), %zmm4
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%rcx), %zmm5
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%r8), %zmm9
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%r9), %zmm10
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%r10), %zmm3
-; AVX512DQBW-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [5,0,14,6,5,0,14,6]
-; AVX512DQBW-SLOW-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm9, %zmm10, %zmm2
-; AVX512DQBW-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm6 = [0,13,6,7,0,13,6,7]
-; AVX512DQBW-SLOW-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm3, %zmm2, %zmm6
-; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [6,14,6,14,6,14,6,14]
-; AVX512DQBW-SLOW-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm5, %zmm4, %zmm2
-; AVX512DQBW-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm7 = [6,14,6,14]
-; AVX512DQBW-SLOW-NEXT:    # ymm7 = mem[0,1,0,1]
-; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm1, %zmm0, %zmm7
-; AVX512DQBW-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm7[0,1,2,3],zmm2[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm4
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%rsi), %zmm5
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%rdx), %zmm7
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%rcx), %zmm6
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%r8), %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%r9), %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%r10), %zmm1
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm8 = [6,14,6,14,6,14,6,14]
+; AVX512DQBW-SLOW-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [2,10,0,3,2,10,0,3]
+; AVX512DQBW-SLOW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm6, %zmm7, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm9
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm11
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [3,0,12,4,3,0,12,4]
+; AVX512DQBW-SLOW-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm7, %zmm6, %zmm10
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm6, %zmm8, %zmm7
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm5, %zmm4, %zmm8
+; AVX512DQBW-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm7 = zmm8[0,1,2,3],zmm7[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [5,0,14,6,5,0,14,6]
+; AVX512DQBW-SLOW-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm0, %zmm2, %zmm8
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm12 = [0,13,6,7,0,13,6,7]
+; AVX512DQBW-SLOW-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm1, %zmm8, %zmm12
 ; AVX512DQBW-SLOW-NEXT:    movb $-61, %sil
 ; AVX512DQBW-SLOW-NEXT:    kmovd %esi, %k1
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm2 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm7 = [11,3,11,3,11,3,11,3]
-; AVX512DQBW-SLOW-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm1, %zmm0, %zmm7
-; AVX512DQBW-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm6 = [2,10,0,3,2,10,0,3]
-; AVX512DQBW-SLOW-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm5, %zmm4, %zmm6
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm7 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm8 = [11,3,11,3,11,3,11,3]
+; AVX512DQBW-SLOW-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm5, %zmm4, %zmm8
 ; AVX512DQBW-SLOW-NEXT:    movb $96, %sil
 ; AVX512DQBW-SLOW-NEXT:    kmovd %esi, %k1
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm6 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vmovdqa (%r9), %ymm7
-; AVX512DQBW-SLOW-NEXT:    vmovdqa (%r8), %ymm8
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm11 = ymm8[0],ymm7[0],ymm8[2],ymm7[2]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm3 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa (%r9), %ymm8
+; AVX512DQBW-SLOW-NEXT:    vmovdqa (%r8), %ymm12
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm8[0],ymm12[2],ymm8[2]
 ; AVX512DQBW-SLOW-NEXT:    movb $28, %sil
 ; AVX512DQBW-SLOW-NEXT:    kmovd %esi, %k2
-; AVX512DQBW-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm6 {%k2} = zmm11[2,3,2,3],zmm3[2,3,2,3]
-; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm11 = [15,7,15,7,15,7,15,7]
-; AVX512DQBW-SLOW-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm12 = [9,1,9,1,9,1,9,1]
-; AVX512DQBW-SLOW-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm10, %zmm9, %zmm12
-; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm13 = [0,8,0,8,0,8,0,8]
+; AVX512DQBW-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm13[2,3,2,3],zmm1[2,3,2,3]
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm13 = [15,7,15,7,15,7,15,7]
 ; AVX512DQBW-SLOW-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm10, %zmm9, %zmm13
-; AVX512DQBW-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm14 = [4,12,0,5,4,12,0,5]
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm6, %zmm13, %zmm9
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm14 = [1,0,10,2,1,0,10,2]
 ; AVX512DQBW-SLOW-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm10, %zmm9, %zmm14
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm10, %zmm11, %zmm9
-; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm5, %zmm4, %zmm11
-; AVX512DQBW-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm10 = [15,7,15,7]
-; AVX512DQBW-SLOW-NEXT:    # ymm10 = mem[0,1,0,1]
-; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm1, %zmm0, %zmm10
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm4, %zmm5, %zmm14
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm15 = [13,5,13,5,13,5,13,5]
+; AVX512DQBW-SLOW-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm6, %zmm15, %zmm11
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm6 = [0,8,0,1,0,8,0,1]
+; AVX512DQBW-SLOW-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm5, %zmm4, %zmm6
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm5, %zmm4, %zmm15
+; AVX512DQBW-SLOW-NEXT:    movb $48, %sil
+; AVX512DQBW-SLOW-NEXT:    kmovd %esi, %k2
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm10 {%k2} = zmm4[0],zmm5[0],zmm4[2],zmm5[2],zmm4[4],zmm5[4],zmm4[6],zmm5[6]
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm5, %zmm13, %zmm4
 ; AVX512DQBW-SLOW-NEXT:    movb $24, %sil
 ; AVX512DQBW-SLOW-NEXT:    kmovd %esi, %k2
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm10 {%k2}
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm10 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm9 = [14,1,2,3,4,5,6,15]
-; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm3, %zmm10, %zmm9
-; AVX512DQBW-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [1,0,10,2,1,0,10,2]
-; AVX512DQBW-SLOW-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm0, %zmm1, %zmm10
-; AVX512DQBW-SLOW-NEXT:    vpbroadcastq 8(%rcx), %ymm11
-; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm11 = mem[0,1,2,3],ymm11[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm4 {%k2}
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm2, %zmm0, %zmm13
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm4 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm5 = [14,1,2,3,4,5,6,15]
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm1, %zmm4, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vpbroadcastq 8(%rcx), %ymm4
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm4 = mem[0,1,2,3],ymm4[4,5,6,7]
 ; AVX512DQBW-SLOW-NEXT:    movb $6, %sil
 ; AVX512DQBW-SLOW-NEXT:    kmovd %esi, %k2
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm11, %zmm0, %zmm10 {%k2}
-; AVX512DQBW-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm11 = [4,9,0,3,4,9,0,3]
-; AVX512DQBW-SLOW-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm3, %zmm12, %zmm11
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm0, %zmm14 {%k2}
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm4 = [9,1,9,1,9,1,9,1]
+; AVX512DQBW-SLOW-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm2, %zmm0, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm9 = [4,9,0,3,4,9,0,3]
+; AVX512DQBW-SLOW-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm1, %zmm4, %zmm9
 ; AVX512DQBW-SLOW-NEXT:    movb $56, %sil
 ; AVX512DQBW-SLOW-NEXT:    kmovd %esi, %k2
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm10 {%k2}
-; AVX512DQBW-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm11 = [0,8,0,1,0,8,0,1]
-; AVX512DQBW-SLOW-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm1, %zmm0, %zmm11
-; AVX512DQBW-SLOW-NEXT:    vmovdqa (%rdx), %xmm12
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm12 = xmm12[0],mem[0]
-; AVX512DQBW-SLOW-NEXT:    vinserti128 $1, %xmm12, %ymm0, %ymm12
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm14 {%k2}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa (%rdx), %xmm4
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],mem[0]
+; AVX512DQBW-SLOW-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
 ; AVX512DQBW-SLOW-NEXT:    movb $12, %cl
 ; AVX512DQBW-SLOW-NEXT:    kmovd %ecx, %k2
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm12, %zmm0, %zmm11 {%k2}
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm0, %zmm6 {%k2}
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm4 = [0,8,0,8,0,8,0,8]
+; AVX512DQBW-SLOW-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm2, %zmm0, %zmm4
 ; AVX512DQBW-SLOW-NEXT:    movb $112, %cl
 ; AVX512DQBW-SLOW-NEXT:    kmovd %ecx, %k2
-; AVX512DQBW-SLOW-NEXT:    vinserti64x2 $3, (%r10), %zmm13, %zmm11 {%k2}
-; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm12 = [13,5,13,5,13,5,13,5]
-; AVX512DQBW-SLOW-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQBW-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm13 = [3,0,12,4,3,0,12,4]
-; AVX512DQBW-SLOW-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm4, %zmm5, %zmm13
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm5, %zmm12, %zmm4
-; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm1, %zmm0, %zmm12
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm12 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm4 = [0,1,12,7,0,1,12,7]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x2 $3, (%r10), %zmm4, %zmm6 {%k2}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm15 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm4 = [4,12,0,5,4,12,0,5]
 ; AVX512DQBW-SLOW-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm3, %zmm14, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm2, %zmm0, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [0,1,12,7,0,1,12,7]
+; AVX512DQBW-SLOW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm1, %zmm4, %zmm0
 ; AVX512DQBW-SLOW-NEXT:    movb $120, %cl
 ; AVX512DQBW-SLOW-NEXT:    kmovd %ecx, %k1
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm4 {%k1}
-; AVX512DQBW-SLOW-NEXT:    movb $48, %cl
-; AVX512DQBW-SLOW-NEXT:    kmovd %ecx, %k1
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm13 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
-; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm8[1],ymm7[1],ymm8[3],ymm7[3]
-; AVX512DQBW-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,3,3]
-; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm15, %zmm0 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm12[1],ymm8[1],ymm12[3],ymm8[3]
+; AVX512DQBW-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,3,3]
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7]
 ; AVX512DQBW-SLOW-NEXT:    movb $14, %cl
 ; AVX512DQBW-SLOW-NEXT:    kmovd %ecx, %k1
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm13 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm4, 256(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm11, (%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm10, 64(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm9, 384(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm6, 128(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm13, 192(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm2, 320(%rax)
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm10 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, 256(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm6, (%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm14, 64(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm5, 384(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm3, 128(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm10, 192(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm7, 320(%rax)
 ; AVX512DQBW-SLOW-NEXT:    vzeroupper
 ; AVX512DQBW-SLOW-NEXT:    retq
 ;
@@ -1616,121 +1607,120 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQBW-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512DQBW-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %r10
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%rsi), %zmm1
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%rdx), %zmm2
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%rcx), %zmm4
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%r8), %zmm8
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%r9), %zmm9
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%r10), %zmm3
-; AVX512DQBW-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm5 = [5,0,14,6,5,0,14,6]
-; AVX512DQBW-FAST-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm8, %zmm9, %zmm5
-; AVX512DQBW-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm6 = [0,13,6,7,0,13,6,7]
-; AVX512DQBW-FAST-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm3, %zmm5, %zmm6
-; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm5 = [6,14,6,14,6,14,6,14]
-; AVX512DQBW-FAST-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm4, %zmm2, %zmm5
-; AVX512DQBW-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm7 = [6,14,6,14]
-; AVX512DQBW-FAST-NEXT:    # ymm7 = mem[0,1,0,1]
-; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm1, %zmm0, %zmm7
-; AVX512DQBW-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm5 = zmm7[0,1,2,3],zmm5[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%rsi), %zmm6
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%rdx), %zmm5
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%rcx), %zmm7
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%r8), %zmm3
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%r9), %zmm4
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%r10), %zmm2
+; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [6,14,6,14,6,14,6,14]
+; AVX512DQBW-FAST-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm5, %zmm8
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm7, %zmm1, %zmm8
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm6, %zmm0, %zmm1
+; AVX512DQBW-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm8[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [5,0,14,6,5,0,14,6]
+; AVX512DQBW-FAST-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm3, %zmm4, %zmm8
+; AVX512DQBW-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm9 = [0,13,6,7,0,13,6,7]
+; AVX512DQBW-FAST-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm2, %zmm8, %zmm9
 ; AVX512DQBW-FAST-NEXT:    movb $-61, %sil
 ; AVX512DQBW-FAST-NEXT:    kmovd %esi, %k1
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm6, %zmm5 {%k1}
-; AVX512DQBW-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm6 = [3,0,12,4,3,0,12,4]
-; AVX512DQBW-FAST-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm2, %zmm4, %zmm6
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm9, %zmm1 {%k1}
+; AVX512DQBW-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [3,0,12,4,3,0,12,4]
+; AVX512DQBW-FAST-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm5, %zmm7, %zmm8
 ; AVX512DQBW-FAST-NEXT:    movb $48, %sil
 ; AVX512DQBW-FAST-NEXT:    kmovd %esi, %k1
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm0[0],zmm6[0],zmm0[2],zmm6[2],zmm0[4],zmm6[4],zmm0[6],zmm6[6]
 ; AVX512DQBW-FAST-NEXT:    vmovdqa (%r9), %ymm10
 ; AVX512DQBW-FAST-NEXT:    vmovdqa (%r8), %ymm11
-; AVX512DQBW-FAST-NEXT:    vmovdqa {{.*#+}} ymm7 = <1,3,7,u>
-; AVX512DQBW-FAST-NEXT:    vpermi2q %ymm10, %ymm11, %ymm7
-; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],mem[6,7]
+; AVX512DQBW-FAST-NEXT:    vmovdqa {{.*#+}} ymm9 = <1,3,7,u>
+; AVX512DQBW-FAST-NEXT:    vpermi2q %ymm10, %ymm11, %ymm9
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7]
 ; AVX512DQBW-FAST-NEXT:    movb $14, %sil
 ; AVX512DQBW-FAST-NEXT:    kmovd %esi, %k1
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm7, %zmm0, %zmm6 {%k1}
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm9, %zmm0, %zmm8 {%k1}
 ; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm12 = [11,3,11,3,11,3,11,3]
 ; AVX512DQBW-FAST-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm1, %zmm0, %zmm12
-; AVX512DQBW-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [2,10,0,3,2,10,0,3]
-; AVX512DQBW-FAST-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm4, %zmm2, %zmm7
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm6, %zmm0, %zmm12
+; AVX512DQBW-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm9 = [2,10,0,3,2,10,0,3]
+; AVX512DQBW-FAST-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm7, %zmm5, %zmm9
 ; AVX512DQBW-FAST-NEXT:    movb $96, %sil
 ; AVX512DQBW-FAST-NEXT:    kmovd %esi, %k1
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm12, %zmm7 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm12, %zmm9 {%k1}
 ; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm10 = ymm11[0],ymm10[0],ymm11[2],ymm10[2]
 ; AVX512DQBW-FAST-NEXT:    movb $28, %sil
 ; AVX512DQBW-FAST-NEXT:    kmovd %esi, %k2
-; AVX512DQBW-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm7 {%k2} = zmm10[2,3,2,3],zmm3[2,3,2,3]
+; AVX512DQBW-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm9 {%k2} = zmm10[2,3,2,3],zmm2[2,3,2,3]
 ; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm10 = [15,7,15,7,15,7,15,7]
 ; AVX512DQBW-FAST-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm11 = [9,1,9,1,9,1,9,1]
-; AVX512DQBW-FAST-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm9, %zmm8, %zmm11
-; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm12 = [0,8,0,8,0,8,0,8]
-; AVX512DQBW-FAST-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm9, %zmm8, %zmm12
-; AVX512DQBW-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm13 = [4,12,0,5,4,12,0,5]
-; AVX512DQBW-FAST-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm9, %zmm8, %zmm13
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm9, %zmm10, %zmm8
-; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm4, %zmm2, %zmm10
-; AVX512DQBW-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm9 = [15,7,15,7]
-; AVX512DQBW-FAST-NEXT:    # ymm9 = mem[0,1,0,1]
-; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm1, %zmm0, %zmm9
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm5, %zmm11
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm7, %zmm10, %zmm11
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm12
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm6, %zmm10, %zmm12
 ; AVX512DQBW-FAST-NEXT:    movb $24, %sil
 ; AVX512DQBW-FAST-NEXT:    kmovd %esi, %k2
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm10, %zmm9 {%k2}
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm8, %zmm9 {%k1}
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm8 = [14,1,2,3,4,5,6,15]
-; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm3, %zmm9, %zmm8
-; AVX512DQBW-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm9 = [1,0,10,2,1,0,10,2]
-; AVX512DQBW-FAST-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm0, %zmm1, %zmm9
-; AVX512DQBW-FAST-NEXT:    vpbroadcastq 8(%rcx), %ymm10
-; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm10 = mem[0,1,2,3],ymm10[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm11, %zmm12 {%k2}
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm4, %zmm3, %zmm10
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm10, %zmm12 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm10 = [14,1,2,3,4,5,6,15]
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm2, %zmm12, %zmm10
+; AVX512DQBW-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm11 = [1,0,10,2,1,0,10,2]
+; AVX512DQBW-FAST-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm0, %zmm6, %zmm11
+; AVX512DQBW-FAST-NEXT:    vpbroadcastq 8(%rcx), %ymm12
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm12 = mem[0,1,2,3],ymm12[4,5,6,7]
 ; AVX512DQBW-FAST-NEXT:    movb $6, %sil
 ; AVX512DQBW-FAST-NEXT:    kmovd %esi, %k2
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm10, %zmm0, %zmm9 {%k2}
-; AVX512DQBW-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [4,9,0,3,4,9,0,3]
-; AVX512DQBW-FAST-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm3, %zmm11, %zmm10
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm12, %zmm0, %zmm11 {%k2}
+; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm12 = [9,1,9,1,9,1,9,1]
+; AVX512DQBW-FAST-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm4, %zmm3, %zmm12
+; AVX512DQBW-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm13 = [4,9,0,3,4,9,0,3]
+; AVX512DQBW-FAST-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm2, %zmm12, %zmm13
 ; AVX512DQBW-FAST-NEXT:    movb $56, %sil
 ; AVX512DQBW-FAST-NEXT:    kmovd %esi, %k2
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm10, %zmm9 {%k2}
-; AVX512DQBW-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [0,8,0,1,0,8,0,1]
-; AVX512DQBW-FAST-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm1, %zmm0, %zmm10
-; AVX512DQBW-FAST-NEXT:    vmovdqa (%rdx), %xmm11
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm11 = xmm11[0],mem[0]
-; AVX512DQBW-FAST-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm13, %zmm11 {%k2}
+; AVX512DQBW-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm12 = [0,8,0,1,0,8,0,1]
+; AVX512DQBW-FAST-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm6, %zmm0, %zmm12
+; AVX512DQBW-FAST-NEXT:    vmovdqa (%rdx), %xmm13
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm13 = xmm13[0],mem[0]
+; AVX512DQBW-FAST-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
 ; AVX512DQBW-FAST-NEXT:    movb $12, %cl
 ; AVX512DQBW-FAST-NEXT:    kmovd %ecx, %k2
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm11, %zmm0, %zmm10 {%k2}
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm13, %zmm0, %zmm12 {%k2}
+; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm13 = [0,8,0,8,0,8,0,8]
+; AVX512DQBW-FAST-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm4, %zmm3, %zmm13
 ; AVX512DQBW-FAST-NEXT:    movb $112, %cl
 ; AVX512DQBW-FAST-NEXT:    kmovd %ecx, %k2
-; AVX512DQBW-FAST-NEXT:    vinserti64x2 $3, (%r10), %zmm12, %zmm10 {%k2}
-; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm11 = [13,5,13,5,13,5,13,5]
-; AVX512DQBW-FAST-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm4, %zmm11, %zmm2
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm1, %zmm11, %zmm0
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm0 {%k1}
-; AVX512DQBW-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,12,7,0,1,12,7]
-; AVX512DQBW-FAST-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm3, %zmm13, %zmm1
+; AVX512DQBW-FAST-NEXT:    vinserti64x2 $3, (%r10), %zmm13, %zmm12 {%k2}
+; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm13 = [13,5,13,5,13,5,13,5]
+; AVX512DQBW-FAST-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm7, %zmm13, %zmm5
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm6, %zmm13, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm5, %zmm0 {%k1}
+; AVX512DQBW-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm5 = [4,12,0,5,4,12,0,5]
+; AVX512DQBW-FAST-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm4, %zmm3, %zmm5
+; AVX512DQBW-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [0,1,12,7,0,1,12,7]
+; AVX512DQBW-FAST-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm2, %zmm5, %zmm3
 ; AVX512DQBW-FAST-NEXT:    movb $120, %cl
 ; AVX512DQBW-FAST-NEXT:    kmovd %ecx, %k1
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, 256(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm10, (%rax)
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm9, 64(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm8, 384(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm7, 128(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm6, 192(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm5, 320(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm3 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, 256(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm12, (%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm11, 64(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm10, 384(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm9, 128(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm8, 192(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, 320(%rax)
 ; AVX512DQBW-FAST-NEXT:    vzeroupper
 ; AVX512DQBW-FAST-NEXT:    retq
   %in.vec0 = load <8 x i64>, ptr %in.vecptr0, align 64
@@ -2509,841 +2499,823 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512F-ONLY-SLOW-LABEL: store_i64_stride7_vf16:
 ; AVX512F-ONLY-SLOW:       # %bb.0:
 ; AVX512F-ONLY-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm4
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 64(%rdi), %zmm6
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%rsi), %zmm5
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 64(%rsi), %zmm7
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%rdx), %zmm8
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 64(%rdx), %zmm9
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%rcx), %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 64(%rcx), %zmm3
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 64(%r8), %zmm11
-; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [6,14,6,14,6,14,6,14]
-; AVX512F-ONLY-SLOW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm6
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 64(%rdi), %zmm18
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%rsi), %zmm7
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 64(%rsi), %zmm9
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%rdx), %zmm11
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 64(%rdx), %zmm20
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%rcx), %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 64(%rcx), %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 64(%r8), %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm16 = [6,14,6,14,6,14,6,14]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm19
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm2, %zmm16, %zmm19
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm8 = [15,7,15,7,15,7,15,7]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm21
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm2, %zmm8, %zmm21
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm22
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm5 = [0,8,0,1,0,8,0,1]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm5, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm9, %zmm18, %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm10 = [13,5,13,5,13,5,13,5]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm2, %zmm10, %zmm0
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm12
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm0, %zmm12
-; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [6,14,6,14]
-; AVX512F-ONLY-SLOW-NEXT:    # ymm0 = mem[0,1,0,1]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm15
-; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} ymm16 = [15,7,15,7]
-; AVX512F-ONLY-SLOW-NEXT:    # ymm16 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm10
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm16, %zmm10
-; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm7, %zmm6, %zmm16
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm13
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm0, %zmm13
-; AVX512F-ONLY-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm23 = zmm13[0,1,2,3],zmm12[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm15
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm23
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm13 = [2,10,0,3,2,10,0,3]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm12
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm2, %zmm13, %zmm12
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm17 = [1,0,10,2,1,0,10,2]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm17 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm14
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm18, %zmm17, %zmm14
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm29 = [3,0,12,4,3,0,12,4]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm29 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm20, %zmm29, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    movb $48, %sil
+; AVX512F-ONLY-SLOW-NEXT:    kmovw %esi, %k3
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm2 {%k3} = zmm18[0],zmm9[0],zmm18[2],zmm9[2],zmm18[4],zmm9[4],zmm18[6],zmm9[6]
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm9, %zmm16, %zmm18
+; AVX512F-ONLY-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm19 = zmm18[0,1,2,3],zmm19[4,5,6,7]
 ; AVX512F-ONLY-SLOW-NEXT:    movb $64, %sil
 ; AVX512F-ONLY-SLOW-NEXT:    kmovw %esi, %k1
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm23 {%k1}
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%r8), %zmm14
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 64(%r9), %zmm24
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%r9), %zmm25
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%rax), %zmm22
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 64(%rax), %zmm30
-; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm12 = [0,8,0,1,0,8,0,1]
-; AVX512F-ONLY-SLOW-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm20 = [0,8,0,8,0,8,0,8]
-; AVX512F-ONLY-SLOW-NEXT:    # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm14, %zmm21
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm25, %zmm20, %zmm21
-; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm24, %zmm11, %zmm20
-; AVX512F-ONLY-SLOW-NEXT:    movb $96, %sil
-; AVX512F-ONLY-SLOW-NEXT:    kmovw %esi, %k1
-; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm13 = [4,12,0,5,4,12,0,5]
-; AVX512F-ONLY-SLOW-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm18
-; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm17 = [15,7,15,7,15,7,15,7]
-; AVX512F-ONLY-SLOW-NEXT:    # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm14, %zmm26
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm25, %zmm17, %zmm26
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm19
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm2, %zmm17, %zmm19
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm19 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%r8), %zmm20
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%r9), %zmm30
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 64(%r9), %zmm31
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 64(%rax), %zmm28
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm9, %zmm8, %zmm22
 ; AVX512F-ONLY-SLOW-NEXT:    movb $24, %sil
+; AVX512F-ONLY-SLOW-NEXT:    kmovw %esi, %k1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm22 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm18 = <u,1,2,3,4,15,u,u>
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm3, %zmm22, %zmm18
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm21 = <u,1,2,3,4,5,15,u>
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm31, %zmm18, %zmm21
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm18 = [14,1,2,3,4,5,6,15]
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm28, %zmm21, %zmm18
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm21 = <13,u,2,3,4,5,6,14>
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm31, %zmm19, %zmm21
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm19 = [0,13,2,3,4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm28, %zmm21, %zmm19
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm26 = [0,8,0,8,0,8,0,8]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm27
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm30, %zmm26, %zmm27
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm31, %zmm3, %zmm26
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm9, %zmm10, %zmm15
+; AVX512F-ONLY-SLOW-NEXT:    movb $96, %sil
 ; AVX512F-ONLY-SLOW-NEXT:    kmovw %esi, %k2
-; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm3, %zmm9, %zmm17
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm16 {%k2}
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm27 = <u,1,2,3,4,15,u,u>
-; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm11, %zmm16, %zmm27
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm24, %zmm13, %zmm11
-; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm29 = [0,1,12,7,0,1,12,7]
-; AVX512F-ONLY-SLOW-NEXT:    # zmm29 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm30, %zmm29, %zmm11
-; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm31 = [11,3,11,3,11,3,11,3]
-; AVX512F-ONLY-SLOW-NEXT:    # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm31, %zmm15
-; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm17 = [2,10,0,3,2,10,0,3]
-; AVX512F-ONLY-SLOW-NEXT:    # zmm17 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm16
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm17, %zmm16
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm15, %zmm16 {%k1}
-; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm28 = [9,1,9,1,9,1,9,1]
-; AVX512F-ONLY-SLOW-NEXT:    # zmm28 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [5,0,14,6,5,0,14,6]
-; AVX512F-ONLY-SLOW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm14, %zmm25, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm25, %zmm14, %zmm13
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm25, %zmm28, %zmm14
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 64(%r9), %ymm15
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm24, %zmm28, %zmm18
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm25 = <u,1,2,3,4,5,15,u>
-; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm24, %zmm27, %zmm25
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = <13,u,2,3,4,5,6,14>
-; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm24, %zmm23, %zmm1
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 64(%r8), %ymm24
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm23 = ymm24[0],ymm15[0],ymm24[2],ymm15[2]
-; AVX512F-ONLY-SLOW-NEXT:    movb $28, %sil
-; AVX512F-ONLY-SLOW-NEXT:    kmovw %esi, %k3
-; AVX512F-ONLY-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm16 {%k3} = zmm23[2,3,2,3],zmm30[2,3,2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm28 = [6,13,14,7,6,13,14,7]
-; AVX512F-ONLY-SLOW-NEXT:    # zmm28 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm26, %zmm22, %zmm28
-; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm27 = [0,13,6,7,0,13,6,7]
-; AVX512F-ONLY-SLOW-NEXT:    # zmm27 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm22, %zmm0, %zmm27
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa (%r9), %ymm15
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm22, %zmm29, %zmm13
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%r8), %ymm26
-; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm5, %zmm4, %zmm31
-; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm2, %zmm8, %zmm17
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm17 {%k1}
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm23 = ymm26[0],ymm15[0],ymm26[2],ymm15[2]
-; AVX512F-ONLY-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm17 {%k3} = zmm23[2,3,2,3],zmm22[2,3,2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm23 = [4,9,0,3,4,9,0,3]
-; AVX512F-ONLY-SLOW-NEXT:    # zmm23 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm22, %zmm23, %zmm14
-; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm29 = [13,5,13,5,13,5,13,5]
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm21 = [4,12,0,5,4,12,0,5]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm21 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm25
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm8, %zmm25
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm24
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm16, %zmm24
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm22
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm10, %zmm22
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm1, %zmm11, %zmm13
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm11, %zmm29, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm11
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm31, %zmm21, %zmm11
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm29 = [11,3,11,3,11,3,11,3]
 ; AVX512F-ONLY-SLOW-NEXT:    # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm30, %zmm23, %zmm18
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm22 = [14,1,2,3,4,5,6,15]
-; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm30, %zmm25, %zmm22
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm23 = [0,13,2,3,4,5,6,7]
-; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm30, %zmm1, %zmm23
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm1
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm29, %zmm1
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm2, %zmm8, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm25 = [3,0,12,4,3,0,12,4]
-; AVX512F-ONLY-SLOW-NEXT:    # zmm25 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm9, %zmm25, %zmm3
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm30
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm2, %zmm29, %zmm30
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm8, %zmm25, %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm8
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm12, %zmm8
-; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm7, %zmm6, %zmm12
-; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm31 = [1,0,10,2,1,0,10,2]
-; AVX512F-ONLY-SLOW-NEXT:    # zmm31 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm9
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm6, %zmm31, %zmm9
-; AVX512F-ONLY-SLOW-NEXT:    movb $48, %sil
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm9, %zmm29, %zmm23
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm9 = [0,1,12,7,0,1,12,7]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm28, %zmm9, %zmm11
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm12 {%k2}
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm23 = [9,1,9,1,9,1,9,1]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm31, %zmm23, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 64(%r9), %ymm31
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm7, %zmm6, %zmm16
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm7, %zmm6, %zmm10
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm7, %zmm6, %zmm29
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm6, %zmm7, %zmm17
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm1 {%k3} = zmm6[0],zmm7[0],zmm6[2],zmm7[2],zmm6[4],zmm7[4],zmm6[6],zmm7[6]
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm8, %zmm6
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm30, %zmm20, %zmm8
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [5,0,14,6,5,0,14,6]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm20, %zmm30, %zmm7
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm30, %zmm20, %zmm21
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm30, %zmm23, %zmm20
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 64(%r8), %ymm23
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm30 = ymm23[0],ymm31[0],ymm23[2],ymm31[2]
+; AVX512F-ONLY-SLOW-NEXT:    movb $28, %sil
 ; AVX512F-ONLY-SLOW-NEXT:    kmovw %esi, %k3
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 {%k3} = zmm6[0],zmm7[0],zmm6[2],zmm7[2],zmm6[4],zmm7[4],zmm6[6],zmm7[6]
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm29, %zmm6
-; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm7 = [6,14,6,14]
-; AVX512F-ONLY-SLOW-NEXT:    # ymm7 = mem[0,1,0,1]
-; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm5, %zmm4, %zmm7
-; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm5, %zmm4, %zmm29
-; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm4, %zmm5, %zmm31
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm2 {%k3} = zmm4[0],zmm5[0],zmm4[2],zmm5[2],zmm4[4],zmm5[4],zmm4[6],zmm5[6]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa (%rdx), %xmm4
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],mem[0]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512F-ONLY-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm12 {%k3} = zmm30[2,3,2,3],zmm28[2,3,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm30 = [4,9,0,3,4,9,0,3]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm30 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm28, %zmm30, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%rax), %zmm28
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [6,13,14,7,6,13,14,7]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm8, %zmm28, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [0,13,6,7,0,13,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm28, %zmm7, %zmm8
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm28, %zmm9, %zmm21
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa (%r9), %ymm7
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm29, %zmm13 {%k2}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa (%r8), %ymm9
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm29 = ymm9[0],ymm7[0],ymm9[2],ymm7[2]
+; AVX512F-ONLY-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm13 {%k3} = zmm29[2,3,2,3],zmm28[2,3,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm28, %zmm30, %zmm20
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%rdx), %xmm28
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm28 = xmm28[0],mem[0]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti32x4 $1, %xmm28, %ymm0, %ymm28
 ; AVX512F-ONLY-SLOW-NEXT:    movb $12, %sil
 ; AVX512F-ONLY-SLOW-NEXT:    kmovw %esi, %k3
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm0, %zmm8 {%k3}
-; AVX512F-ONLY-SLOW-NEXT:    vinserti32x4 $3, (%rax), %zmm21, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm28, %zmm0, %zmm4 {%k3}
+; AVX512F-ONLY-SLOW-NEXT:    vinserti32x4 $3, (%rax), %zmm27, %zmm27
 ; AVX512F-ONLY-SLOW-NEXT:    movb $112, %sil
 ; AVX512F-ONLY-SLOW-NEXT:    kmovw %esi, %k4
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm8 {%k4}
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 64(%rdx), %xmm4
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],mem[0]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm0, %zmm12 {%k3}
-; AVX512F-ONLY-SLOW-NEXT:    vinserti32x4 $3, 64(%rax), %zmm20, %zmm4
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm12 {%k4}
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm6 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm27, %zmm4 {%k4}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 64(%rdx), %xmm27
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm27 = xmm27[0],mem[0]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti32x4 $1, %xmm27, %ymm0, %ymm27
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm27, %zmm0, %zmm5 {%k3}
+; AVX512F-ONLY-SLOW-NEXT:    vinserti32x4 $3, 64(%rax), %zmm26, %zmm26
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm26, %zmm5 {%k4}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm26, %zmm15 {%k2}
 ; AVX512F-ONLY-SLOW-NEXT:    movb $120, %sil
 ; AVX512F-ONLY-SLOW-NEXT:    kmovw %esi, %k3
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm11 {%k3}
-; AVX512F-ONLY-SLOW-NEXT:    vpbroadcastq 72(%rcx), %ymm1
-; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm15, %zmm11 {%k3}
+; AVX512F-ONLY-SLOW-NEXT:    vpbroadcastq 72(%rcx), %ymm15
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm15 = mem[0,1,2,3],ymm15[4,5,6,7]
 ; AVX512F-ONLY-SLOW-NEXT:    movb $6, %sil
 ; AVX512F-ONLY-SLOW-NEXT:    kmovw %esi, %k4
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm9 {%k4}
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm15, %zmm0, %zmm14 {%k4}
 ; AVX512F-ONLY-SLOW-NEXT:    movb $56, %sil
 ; AVX512F-ONLY-SLOW-NEXT:    kmovw %esi, %k5
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm9 {%k5}
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm19, %zmm10 {%k2}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm14 {%k5}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm25, %zmm6 {%k1}
 ; AVX512F-ONLY-SLOW-NEXT:    movb $-31, %sil
-; AVX512F-ONLY-SLOW-NEXT:    kmovw %esi, %k2
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm10 {%k2}
-; AVX512F-ONLY-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm7[0,1,2,3],zmm0[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    kmovw %esi, %k1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm6 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm16[0,1,2,3],zmm24[4,5,6,7]
 ; AVX512F-ONLY-SLOW-NEXT:    movb $-61, %sil
-; AVX512F-ONLY-SLOW-NEXT:    kmovw %esi, %k2
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm27, %zmm1 {%k2}
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm30, %zmm29 {%k1}
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm29, %zmm13 {%k3}
-; AVX512F-ONLY-SLOW-NEXT:    vpbroadcastq 8(%rcx), %ymm4
-; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm4 = mem[0,1,2,3],ymm4[4,5,6,7]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm0, %zmm31 {%k4}
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm14, %zmm31 {%k5}
-; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm24, %ymm4 # 32-byte Folded Reload
-; AVX512F-ONLY-SLOW-NEXT:    # ymm4 = ymm24[1],mem[1],ymm24[3],mem[3]
-; AVX512F-ONLY-SLOW-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,2,3,3]
-; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],mem[6,7]
+; AVX512F-ONLY-SLOW-NEXT:    kmovw %esi, %k1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm0 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm22, %zmm10 {%k2}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm21 {%k3}
+; AVX512F-ONLY-SLOW-NEXT:    vpbroadcastq 8(%rcx), %ymm3
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm3 = mem[0,1,2,3],ymm3[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm0, %zmm17 {%k4}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm17 {%k5}
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm23[1],ymm31[1],ymm23[3],ymm31[3]
+; AVX512F-ONLY-SLOW-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,2,3,3]
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7]
 ; AVX512F-ONLY-SLOW-NEXT:    movb $14, %cl
 ; AVX512F-ONLY-SLOW-NEXT:    kmovw %ecx, %k1
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm0, %zmm3 {%k1}
-; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm26[1],ymm15[1],ymm26[3],ymm15[3]
-; AVX512F-ONLY-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,3,3]
-; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm0, %zmm2 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm9[1],ymm7[1],ymm9[3],ymm7[3]
+; AVX512F-ONLY-SLOW-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,2,3,3]
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm0, %zmm1 {%k1}
 ; AVX512F-ONLY-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm31, 64(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm17, 128(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, 256(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, 320(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm10, 384(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, 512(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm16, 576(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm17, 64(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, 128(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm21, 256(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, 320(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, 384(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm14, 512(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm12, 576(%rax)
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm11, 704(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, 192(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm12, 448(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, 640(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, (%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm23, 768(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm22, 832(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, 192(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, 448(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, 640(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, (%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm19, 768(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm18, 832(%rax)
 ; AVX512F-ONLY-SLOW-NEXT:    vzeroupper
 ; AVX512F-ONLY-SLOW-NEXT:    retq
 ;
 ; AVX512F-ONLY-FAST-LABEL: store_i64_stride7_vf16:
 ; AVX512F-ONLY-FAST:       # %bb.0:
 ; AVX512F-ONLY-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%rdi), %zmm12
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%rdi), %zmm4
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%rsi), %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%rsi), %zmm5
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%rdx), %zmm1
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%rdx), %zmm7
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%rcx), %zmm17
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%rcx), %zmm9
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%r8), %zmm30
-; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm20 = [6,14,6,14,6,14,6,14]
-; AVX512F-ONLY-FAST-NEXT:    # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, %zmm18
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm9, %zmm20, %zmm18
-; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} ymm21 = [6,14,6,14]
-; AVX512F-ONLY-FAST-NEXT:    # ymm21 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm19
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm21, %zmm19
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%r8), %zmm11
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%r9), %zmm10
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%r9), %zmm15
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%rax), %zmm14
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%rax), %zmm31
-; AVX512F-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [0,8,0,1,0,8,0,1]
-; AVX512F-ONLY-FAST-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm12, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm3, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa (%rdx), %xmm13
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%rdx), %xmm16
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm13 = xmm13[0],mem[0]
-; AVX512F-ONLY-FAST-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
-; AVX512F-ONLY-FAST-NEXT:    movb $12, %sil
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%rdi), %zmm18
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%rdi), %zmm26
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%rsi), %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%rsi), %zmm19
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%rdx), %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%rdx), %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%rcx), %zmm15
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%rcx), %zmm21
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%r8), %zmm6
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm17 = [6,14,6,14,6,14,6,14]
+; AVX512F-ONLY-FAST-NEXT:    # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm21, %zmm17, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm26, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm19, %zmm17, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm8 = zmm4[0,1,2,3],zmm0[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    movb $64, %sil
 ; AVX512F-ONLY-FAST-NEXT:    kmovw %esi, %k1
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm13, %zmm0, %zmm2 {%k1}
-; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm13 = [0,8,0,8,0,8,0,8]
-; AVX512F-ONLY-FAST-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm11, %zmm22
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm15, %zmm13, %zmm22
-; AVX512F-ONLY-FAST-NEXT:    vinserti32x4 $3, (%rax), %zmm22, %zmm22
-; AVX512F-ONLY-FAST-NEXT:    movb $112, %sil
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm6, %zmm8 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%r8), %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%r9), %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%r9), %zmm23
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%rax), %zmm12
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm11 = [15,7,15,7,15,7,15,7]
+; AVX512F-ONLY-FAST-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm21, %zmm11, %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm26, %zmm9
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm19, %zmm11, %zmm9
+; AVX512F-ONLY-FAST-NEXT:    movb $24, %sil
+; AVX512F-ONLY-FAST-NEXT:    kmovw %esi, %k1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, %zmm9 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm7 = <u,1,2,3,4,15,u,u>
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm6, %zmm9, %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm9 = <u,1,2,3,4,5,15,u>
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm23, %zmm7, %zmm9
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [14,1,2,3,4,5,6,15]
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm12, %zmm9, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm9 = <13,u,2,3,4,5,6,14>
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm23, %zmm8, %zmm9
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm8 = [0,13,2,3,4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm12, %zmm9, %zmm8
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [0,8,0,1,0,8,0,1]
+; AVX512F-ONLY-FAST-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm18, %zmm9
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm2, %zmm10, %zmm9
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm7 = [0,8,0,8,0,8,0,8]
+; AVX512F-ONLY-FAST-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm22
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm7, %zmm22
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm19, %zmm26, %zmm10
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm23, %zmm6, %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm14 = [13,5,13,5,13,5,13,5]
+; AVX512F-ONLY-FAST-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm25
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm21, %zmm14, %zmm25
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm26, %zmm24
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm19, %zmm14, %zmm24
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm13 = [4,12,0,5,4,12,0,5]
+; AVX512F-ONLY-FAST-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm20 = [3,0,12,4,3,0,12,4]
+; AVX512F-ONLY-FAST-NEXT:    # zmm20 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm21, %zmm16
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm20, %zmm16
+; AVX512F-ONLY-FAST-NEXT:    movb $48, %sil
 ; AVX512F-ONLY-FAST-NEXT:    kmovw %esi, %k2
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm22, %zmm2 {%k2}
-; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm5, %zmm4, %zmm3
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm16 = xmm16[0],mem[0]
-; AVX512F-ONLY-FAST-NEXT:    vinserti32x4 $1, %xmm16, %ymm0, %ymm16
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm16, %zmm0, %zmm3 {%k1}
-; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm10, %zmm30, %zmm13
-; AVX512F-ONLY-FAST-NEXT:    vinserti32x4 $3, 64(%rax), %zmm13, %zmm13
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm3 {%k2}
-; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm24 = [13,5,13,5,13,5,13,5]
-; AVX512F-ONLY-FAST-NEXT:    # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, %zmm13
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm9, %zmm24, %zmm13
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm26
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm24, %zmm26
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm16 {%k2} = zmm26[0],zmm19[0],zmm26[2],zmm19[2],zmm26[4],zmm19[4],zmm26[6],zmm19[6]
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm27 = [11,3,11,3,11,3,11,3]
+; AVX512F-ONLY-FAST-NEXT:    # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm26, %zmm29
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm19, %zmm27, %zmm29
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm31 = [1,0,10,2,1,0,10,2]
+; AVX512F-ONLY-FAST-NEXT:    # zmm31 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm26, %zmm31, %zmm19
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm18, %zmm26
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm2, %zmm11, %zmm26
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm28
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm15, %zmm17, %zmm28
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm2, %zmm18, %zmm17
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm30
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm15, %zmm14, %zmm30
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm2, %zmm18, %zmm14
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm3, %zmm15, %zmm20
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm20 {%k2} = zmm18[0],zmm2[0],zmm18[2],zmm2[2],zmm18[4],zmm2[4],zmm18[6],zmm2[6]
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm2, %zmm18, %zmm27
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm18, %zmm31, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm6, %zmm18
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm23, %zmm13, %zmm18
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm31 = [2,10,0,3,2,10,0,3]
+; AVX512F-ONLY-FAST-NEXT:    # zmm31 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm21, %zmm31, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [9,1,9,1,9,1,9,1]
+; AVX512F-ONLY-FAST-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm23, %zmm0, %zmm6
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm21
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm15, %zmm31, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%r9), %ymm23
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm15, %zmm11, %zmm21
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm5, %zmm4, %zmm11
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm31 = [5,0,14,6,5,0,14,6]
+; AVX512F-ONLY-FAST-NEXT:    # zmm31 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm4, %zmm5, %zmm31
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm5, %zmm4, %zmm13
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm0, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 64(%r8), %ymm0
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm5 = ymm0[0],ymm23[0],ymm0[2],ymm23[2]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} ymm15 = [1,3,7,7]
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %ymm23, %ymm15, %ymm0
 ; AVX512F-ONLY-FAST-NEXT:    movb $96, %sil
-; AVX512F-ONLY-FAST-NEXT:    kmovw %esi, %k1
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm26 {%k1}
-; AVX512F-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm16 = [4,12,0,5,4,12,0,5]
-; AVX512F-ONLY-FAST-NEXT:    # zmm16 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm30, %zmm13
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm10, %zmm16, %zmm13
-; AVX512F-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm27 = [0,1,12,7,0,1,12,7]
-; AVX512F-ONLY-FAST-NEXT:    # zmm27 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm31, %zmm27, %zmm13
-; AVX512F-ONLY-FAST-NEXT:    movb $120, %sil
-; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm23 = [15,7,15,7,15,7,15,7]
-; AVX512F-ONLY-FAST-NEXT:    # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm11, %zmm22
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm15, %zmm23, %zmm22
-; AVX512F-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm28 = [6,13,14,7,6,13,14,7]
-; AVX512F-ONLY-FAST-NEXT:    # zmm28 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm22, %zmm14, %zmm28
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm29
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm17, %zmm23, %zmm29
-; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} ymm25 = [15,7,15,7]
-; AVX512F-ONLY-FAST-NEXT:    # ymm25 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm12, %zmm22
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm25, %zmm22
-; AVX512F-ONLY-FAST-NEXT:    movb $24, %dil
-; AVX512F-ONLY-FAST-NEXT:    kmovw %edi, %k2
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm29, %zmm22 {%k2}
-; AVX512F-ONLY-FAST-NEXT:    movb $-31, %dil
-; AVX512F-ONLY-FAST-NEXT:    kmovw %edi, %k3
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm28, %zmm22 {%k3}
-; AVX512F-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm28 = [5,0,14,6,5,0,14,6]
-; AVX512F-ONLY-FAST-NEXT:    # zmm28 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm11, %zmm15, %zmm28
-; AVX512F-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm29 = [0,13,6,7,0,13,6,7]
+; AVX512F-ONLY-FAST-NEXT:    kmovw %esi, %k2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm29, %zmm1 {%k2}
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm23 = [0,1,12,7,0,1,12,7]
+; AVX512F-ONLY-FAST-NEXT:    # zmm23 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm12, %zmm23, %zmm18
+; AVX512F-ONLY-FAST-NEXT:    movb $28, %sil
+; AVX512F-ONLY-FAST-NEXT:    kmovw %esi, %k3
+; AVX512F-ONLY-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k3} = zmm5[2,3,2,3],zmm12[2,3,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm5 = [4,9,0,3,4,9,0,3]
+; AVX512F-ONLY-FAST-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm12, %zmm5, %zmm6
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%rax), %zmm12
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm29 = [6,13,14,7,6,13,14,7]
 ; AVX512F-ONLY-FAST-NEXT:    # zmm29 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm14, %zmm28, %zmm29
-; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm17, %zmm1, %zmm20
-; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm0, %zmm12, %zmm21
-; AVX512F-ONLY-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm20 = zmm21[0,1,2,3],zmm20[4,5,6,7]
-; AVX512F-ONLY-FAST-NEXT:    movb $-61, %dil
-; AVX512F-ONLY-FAST-NEXT:    kmovw %edi, %k3
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm29, %zmm20 {%k3}
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm11, %zmm12, %zmm29
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm11 = [0,13,6,7,0,13,6,7]
+; AVX512F-ONLY-FAST-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm12, %zmm31, %zmm11
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm12, %zmm23, %zmm13
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%r9), %ymm23
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm27, %zmm3 {%k2}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%r8), %ymm27
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %ymm23, %ymm27, %ymm15
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm23 = ymm27[0],ymm23[0],ymm27[2],ymm23[2]
+; AVX512F-ONLY-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm3 {%k3} = zmm23[2,3,2,3],zmm12[2,3,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm12, %zmm5, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa (%rdx), %xmm5
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0]
+; AVX512F-ONLY-FAST-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX512F-ONLY-FAST-NEXT:    movb $12, %sil
 ; AVX512F-ONLY-FAST-NEXT:    kmovw %esi, %k3
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm26, %zmm13 {%k3}
-; AVX512F-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm26 = [3,0,12,4,3,0,12,4]
-; AVX512F-ONLY-FAST-NEXT:    # zmm26 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, %zmm21
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm26, %zmm21
-; AVX512F-ONLY-FAST-NEXT:    movb $48, %sil
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm28
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm17, %zmm24, %zmm28
-; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm0, %zmm12, %zmm24
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm28, %zmm24 {%k1}
-; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm15, %zmm11, %zmm16
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm14, %zmm27, %zmm16
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm24, %zmm16 {%k3}
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm5, %zmm0, %zmm9 {%k3}
+; AVX512F-ONLY-FAST-NEXT:    vinserti32x4 $3, (%rax), %zmm22, %zmm5
+; AVX512F-ONLY-FAST-NEXT:    movb $112, %sil
+; AVX512F-ONLY-FAST-NEXT:    kmovw %esi, %k4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, %zmm9 {%k4}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 64(%rdx), %xmm5
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0]
+; AVX512F-ONLY-FAST-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm5, %zmm0, %zmm10 {%k3}
+; AVX512F-ONLY-FAST-NEXT:    vinserti32x4 $3, 64(%rax), %zmm7, %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, %zmm10 {%k4}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm25, %zmm24 {%k2}
+; AVX512F-ONLY-FAST-NEXT:    movb $120, %sil
 ; AVX512F-ONLY-FAST-NEXT:    kmovw %esi, %k3
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm21 {%k3} = zmm4[0],zmm5[0],zmm4[2],zmm5[2],zmm4[4],zmm5[4],zmm4[6],zmm5[6]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%r9), %ymm24
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%r9), %ymm27
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%r8), %ymm28
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} ymm6 = [1,3,7,7]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %ymm28, %ymm8
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %ymm27, %ymm6, %ymm8
-; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm24, %zmm18 {%k3}
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
 ; AVX512F-ONLY-FAST-NEXT:    movb $14, %sil
-; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm1, %zmm17, %zmm26
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm26 {%k3} = zmm12[0],zmm0[0],zmm12[2],zmm0[2],zmm12[4],zmm0[4],zmm12[6],zmm0[6]
-; AVX512F-ONLY-FAST-NEXT:    kmovw %esi, %k3
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm8, %zmm0, %zmm21 {%k3}
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa (%r8), %ymm8
-; AVX512F-ONLY-FAST-NEXT:    vpermi2q %ymm24, %ymm8, %ymm6
-; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],mem[6,7]
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm6, %zmm0, %zmm26 {%k3}
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm6 = ymm28[0],ymm27[0],ymm28[2],ymm27[2]
-; AVX512F-ONLY-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm18 = zmm19[0,1,2,3],zmm18[4,5,6,7]
-; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm19 = [11,3,11,3,11,3,11,3]
-; AVX512F-ONLY-FAST-NEXT:    # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm27 = [2,10,0,3,2,10,0,3]
-; AVX512F-ONLY-FAST-NEXT:    # zmm27 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm17, %zmm27, %zmm1
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, %zmm17
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm9, %zmm27, %zmm17
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm27
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm19, %zmm27
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm27, %zmm17 {%k1}
-; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm0, %zmm12, %zmm19
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm19, %zmm1 {%k1}
-; AVX512F-ONLY-FAST-NEXT:    movb $28, %al
-; AVX512F-ONLY-FAST-NEXT:    kmovw %eax, %k1
-; AVX512F-ONLY-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm6[2,3,2,3],zmm31[2,3,2,3]
-; AVX512F-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm6 = [1,0,10,2,1,0,10,2]
-; AVX512F-ONLY-FAST-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm12, %zmm6, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, %zmm12
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm6, %zmm12
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm6 = ymm8[0],ymm24[0],ymm8[2],ymm24[2]
-; AVX512F-ONLY-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm6[2,3,2,3],zmm14[2,3,2,3]
-; AVX512F-ONLY-FAST-NEXT:    vpbroadcastq 72(%rcx), %ymm6
-; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm6 = mem[0,1,2,3],ymm6[4,5,6,7]
-; AVX512F-ONLY-FAST-NEXT:    movb $6, %al
-; AVX512F-ONLY-FAST-NEXT:    kmovw %eax, %k1
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm6, %zmm0, %zmm12 {%k1}
-; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm6 = [9,1,9,1,9,1,9,1]
-; AVX512F-ONLY-FAST-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm15, %zmm6, %zmm11
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm30, %zmm8
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm10, %zmm6, %zmm8
-; AVX512F-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm6 = [4,9,0,3,4,9,0,3]
-; AVX512F-ONLY-FAST-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm14, %zmm6, %zmm11
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm31, %zmm6, %zmm8
-; AVX512F-ONLY-FAST-NEXT:    vpbroadcastq 8(%rcx), %ymm6
-; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm6 = mem[0,1,2,3],ymm6[4,5,6,7]
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm6, %zmm0, %zmm0 {%k1}
-; AVX512F-ONLY-FAST-NEXT:    movb $64, %al
-; AVX512F-ONLY-FAST-NEXT:    kmovw %eax, %k1
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm30, %zmm18 {%k1}
-; AVX512F-ONLY-FAST-NEXT:    movb $56, %al
-; AVX512F-ONLY-FAST-NEXT:    kmovw %eax, %k1
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, %zmm12 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    kmovw %esi, %k4
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm16 {%k4}
+; AVX512F-ONLY-FAST-NEXT:    vpbroadcastq 72(%rcx), %ymm0
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    movb $6, %sil
+; AVX512F-ONLY-FAST-NEXT:    kmovw %esi, %k5
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm19 {%k5}
+; AVX512F-ONLY-FAST-NEXT:    movb $56, %sil
+; AVX512F-ONLY-FAST-NEXT:    kmovw %esi, %k6
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm6, %zmm19 {%k6}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm21, %zmm26 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    movb $-31, %sil
+; AVX512F-ONLY-FAST-NEXT:    kmovw %esi, %k1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm29, %zmm26 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm17[0,1,2,3],zmm28[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    movb $-61, %sil
+; AVX512F-ONLY-FAST-NEXT:    kmovw %esi, %k1
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm11, %zmm0 {%k1}
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm9, %zmm23, %zmm7
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm25, %zmm4
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, %zmm4 {%k2}
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm5 = <u,1,2,3,4,15,u,u>
-; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm30, %zmm4, %zmm5
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm4 = <u,1,2,3,4,5,15,u>
-; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm10, %zmm5, %zmm4
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm5 = <13,u,2,3,4,5,6,14>
-; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm10, %zmm18, %zmm5
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm6 = [14,1,2,3,4,5,6,15]
-; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm31, %zmm4, %zmm6
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [0,13,2,3,4,5,6,7]
-; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm31, %zmm5, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm30, %zmm14 {%k2}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm14, %zmm13 {%k3}
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm5 = ymm15[0,1,2,3,4,5],mem[6,7]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm5, %zmm0, %zmm20 {%k4}
+; AVX512F-ONLY-FAST-NEXT:    vpbroadcastq 8(%rcx), %ymm5
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm5 = mem[0,1,2,3],ymm5[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm5, %zmm0, %zmm2 {%k5}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm2 {%k6}
 ; AVX512F-ONLY-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, 64(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, 128(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm26, 192(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm16, 256(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm20, 320(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm22, 384(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm12, 512(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm17, 576(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm21, 640(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, 704(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, 448(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, (%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, 768(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm6, 832(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, 64(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, 128(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm20, 192(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, 256(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, 320(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm26, 384(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm19, 512(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, 576(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm16, 640(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm18, 704(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm10, 448(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, (%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, 768(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 832(%rax)
 ; AVX512F-ONLY-FAST-NEXT:    vzeroupper
 ; AVX512F-ONLY-FAST-NEXT:    retq
 ;
 ; AVX512DQ-SLOW-LABEL: store_i64_stride7_vf16:
 ; AVX512DQ-SLOW:       # %bb.0:
 ; AVX512DQ-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm4
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 64(%rdi), %zmm6
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%rsi), %zmm13
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 64(%rsi), %zmm7
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%rdx), %zmm8
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 64(%rdx), %zmm9
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%rcx), %zmm2
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 64(%rcx), %zmm3
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 64(%r8), %zmm11
-; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [6,14,6,14,6,14,6,14]
-; AVX512DQ-SLOW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm12
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm3, %zmm0, %zmm12
-; AVX512DQ-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14]
-; AVX512DQ-SLOW-NEXT:    # ymm1 = mem[0,1,0,1]
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm0
-; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} ymm16 = [15,7,15,7]
-; AVX512DQ-SLOW-NEXT:    # ymm16 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm10
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm13, %zmm16, %zmm10
-; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm7, %zmm6, %zmm16
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm14
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm7, %zmm1, %zmm14
-; AVX512DQ-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm23 = zmm14[0,1,2,3],zmm12[4,5,6,7]
-; AVX512DQ-SLOW-NEXT:    movb $64, %sil
-; AVX512DQ-SLOW-NEXT:    kmovw %esi, %k1
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm23 {%k1}
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%r8), %zmm14
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 64(%r9), %zmm24
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%r9), %zmm25
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%rax), %zmm22
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 64(%rax), %zmm28
-; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm17 = [0,8,0,8,0,8,0,8]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm6
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 64(%rdi), %zmm18
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%rsi), %zmm7
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 64(%rsi), %zmm12
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%rdx), %zmm13
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 64(%rdx), %zmm20
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%rcx), %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 64(%rcx), %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 64(%r8), %zmm16
+; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm17 = [6,14,6,14,6,14,6,14]
 ; AVX512DQ-SLOW-NEXT:    # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm14, %zmm21
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm25, %zmm17, %zmm21
-; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm15 = [13,5,13,5,13,5,13,5]
-; AVX512DQ-SLOW-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-SLOW-NEXT:    movb $96, %sil
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm19
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm0, %zmm17, %zmm19
+; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm9 = [15,7,15,7,15,7,15,7]
+; AVX512DQ-SLOW-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm21
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm0, %zmm9, %zmm21
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm22
+; AVX512DQ-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm5 = [0,8,0,1,0,8,0,1]
+; AVX512DQ-SLOW-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm4
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm7, %zmm5, %zmm4
+; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm10 = [13,5,13,5,13,5,13,5]
+; AVX512DQ-SLOW-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm2
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm0, %zmm10, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm23
+; AVX512DQ-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm11 = [2,10,0,3,2,10,0,3]
+; AVX512DQ-SLOW-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm8
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm0, %zmm11, %zmm8
+; AVX512DQ-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm15 = [1,0,10,2,1,0,10,2]
+; AVX512DQ-SLOW-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm14
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm18, %zmm15, %zmm14
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm12, %zmm18, %zmm5
+; AVX512DQ-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm29 = [3,0,12,4,3,0,12,4]
+; AVX512DQ-SLOW-NEXT:    # zmm29 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm20, %zmm29, %zmm0
+; AVX512DQ-SLOW-NEXT:    movb $48, %sil
+; AVX512DQ-SLOW-NEXT:    kmovw %esi, %k3
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm0 {%k3} = zmm18[0],zmm12[0],zmm18[2],zmm12[2],zmm18[4],zmm12[4],zmm18[6],zmm12[6]
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm12, %zmm17, %zmm18
+; AVX512DQ-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm19 = zmm18[0,1,2,3],zmm19[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    movb $64, %sil
 ; AVX512DQ-SLOW-NEXT:    kmovw %esi, %k1
-; AVX512DQ-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm12 = [4,12,0,5,4,12,0,5]
-; AVX512DQ-SLOW-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm20
-; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm24, %zmm11, %zmm17
-; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm18 = [15,7,15,7,15,7,15,7]
-; AVX512DQ-SLOW-NEXT:    # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm14, %zmm26
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm25, %zmm18, %zmm26
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm19
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm2, %zmm18, %zmm19
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm19 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%r8), %zmm20
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%r9), %zmm28
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 64(%r9), %zmm30
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 64(%rax), %zmm27
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm12, %zmm9, %zmm22
 ; AVX512DQ-SLOW-NEXT:    movb $24, %sil
-; AVX512DQ-SLOW-NEXT:    kmovw %esi, %k2
-; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm3, %zmm9, %zmm18
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm16 {%k2}
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm27 = <u,1,2,3,4,15,u,u>
-; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm11, %zmm16, %zmm27
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm24, %zmm12, %zmm11
-; AVX512DQ-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm30 = [0,1,12,7,0,1,12,7]
-; AVX512DQ-SLOW-NEXT:    # zmm30 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm28, %zmm30, %zmm11
-; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm31 = [11,3,11,3,11,3,11,3]
+; AVX512DQ-SLOW-NEXT:    kmovw %esi, %k1
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm22 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm18 = <u,1,2,3,4,15,u,u>
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm16, %zmm22, %zmm18
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm21 = <u,1,2,3,4,5,15,u>
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm30, %zmm18, %zmm21
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm18 = [14,1,2,3,4,5,6,15]
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm27, %zmm21, %zmm18
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm21 = <13,u,2,3,4,5,6,14>
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm30, %zmm19, %zmm21
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm19 = [0,13,2,3,4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm27, %zmm21, %zmm19
+; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm31 = [0,8,0,8,0,8,0,8]
 ; AVX512DQ-SLOW-NEXT:    # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm7, %zmm31, %zmm0
-; AVX512DQ-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm18 = [2,10,0,3,2,10,0,3]
-; AVX512DQ-SLOW-NEXT:    # zmm18 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm16
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm3, %zmm18, %zmm16
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm16 {%k1}
-; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm29 = [9,1,9,1,9,1,9,1]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm26
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm28, %zmm31, %zmm26
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm12, %zmm10, %zmm2
+; AVX512DQ-SLOW-NEXT:    movb $96, %sil
+; AVX512DQ-SLOW-NEXT:    kmovw %esi, %k2
+; AVX512DQ-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm21 = [4,12,0,5,4,12,0,5]
+; AVX512DQ-SLOW-NEXT:    # zmm21 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm25
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm1, %zmm9, %zmm25
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm24
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm1, %zmm17, %zmm24
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm22
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm1, %zmm10, %zmm22
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm1, %zmm13, %zmm11
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm13, %zmm29, %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm13
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm30, %zmm21, %zmm13
+; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm29 = [11,3,11,3,11,3,11,3]
 ; AVX512DQ-SLOW-NEXT:    # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [5,0,14,6,5,0,14,6]
-; AVX512DQ-SLOW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm14, %zmm25, %zmm0
-; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm25, %zmm14, %zmm12
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm25, %zmm29, %zmm14
-; AVX512DQ-SLOW-NEXT:    vmovdqa 64(%r9), %ymm5
-; AVX512DQ-SLOW-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm24, %zmm29, %zmm20
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm25 = <u,1,2,3,4,5,15,u>
-; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm24, %zmm27, %zmm25
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = <13,u,2,3,4,5,6,14>
-; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm24, %zmm23, %zmm1
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 64(%r8), %ymm23
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm24 = ymm23[0],ymm5[0],ymm23[2],ymm5[2]
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm12, %zmm29, %zmm23
+; AVX512DQ-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm12 = [0,1,12,7,0,1,12,7]
+; AVX512DQ-SLOW-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm27, %zmm12, %zmm13
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm8 {%k2}
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm3
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm30, %zmm31, %zmm16
+; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm31 = [9,1,9,1,9,1,9,1]
+; AVX512DQ-SLOW-NEXT:    # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm30, %zmm31, %zmm3
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 64(%r9), %ymm23
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm7, %zmm6, %zmm17
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm7, %zmm6, %zmm10
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm7, %zmm6, %zmm29
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm6, %zmm7, %zmm15
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm1 {%k3} = zmm6[0],zmm7[0],zmm6[2],zmm7[2],zmm6[4],zmm7[4],zmm6[6],zmm7[6]
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm7, %zmm9, %zmm6
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm28, %zmm20, %zmm9
+; AVX512DQ-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [5,0,14,6,5,0,14,6]
+; AVX512DQ-SLOW-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm20, %zmm28, %zmm7
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm28, %zmm20, %zmm21
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm28, %zmm31, %zmm20
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 64(%r8), %ymm28
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm30 = ymm28[0],ymm23[0],ymm28[2],ymm23[2]
 ; AVX512DQ-SLOW-NEXT:    movb $28, %sil
 ; AVX512DQ-SLOW-NEXT:    kmovw %esi, %k3
-; AVX512DQ-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm16 {%k3} = zmm24[2,3,2,3],zmm28[2,3,2,3]
-; AVX512DQ-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm29 = [6,13,14,7,6,13,14,7]
-; AVX512DQ-SLOW-NEXT:    # zmm29 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm26, %zmm22, %zmm29
-; AVX512DQ-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm27 = [0,13,6,7,0,13,6,7]
-; AVX512DQ-SLOW-NEXT:    # zmm27 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm22, %zmm0, %zmm27
-; AVX512DQ-SLOW-NEXT:    vmovdqa (%r9), %ymm5
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm22, %zmm30, %zmm12
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%r8), %ymm26
-; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm13, %zmm4, %zmm31
-; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm2, %zmm8, %zmm18
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm18 {%k1}
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm24 = ymm26[0],ymm5[0],ymm26[2],ymm5[2]
-; AVX512DQ-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm18 {%k3} = zmm24[2,3,2,3],zmm22[2,3,2,3]
-; AVX512DQ-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm24 = [4,9,0,3,4,9,0,3]
-; AVX512DQ-SLOW-NEXT:    # zmm24 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm22, %zmm24, %zmm14
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm0
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm3, %zmm15, %zmm0
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm28, %zmm24, %zmm20
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm22 = [14,1,2,3,4,5,6,15]
-; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm28, %zmm25, %zmm22
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm24 = [0,13,2,3,4,5,6,7]
-; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm28, %zmm1, %zmm24
-; AVX512DQ-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm28 = [1,0,10,2,1,0,10,2]
-; AVX512DQ-SLOW-NEXT:    # zmm28 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm2, %zmm8, %zmm25
-; AVX512DQ-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [3,0,12,4,3,0,12,4]
-; AVX512DQ-SLOW-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm9, %zmm1, %zmm3
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm31
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm2, %zmm15, %zmm31
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm8, %zmm1, %zmm2
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm9
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm6, %zmm28, %zmm9
-; AVX512DQ-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm30 = [0,8,0,1,0,8,0,1]
+; AVX512DQ-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm8 {%k3} = zmm30[2,3,2,3],zmm27[2,3,2,3]
+; AVX512DQ-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm30 = [4,9,0,3,4,9,0,3]
 ; AVX512DQ-SLOW-NEXT:    # zmm30 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm8
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm13, %zmm30, %zmm8
-; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm7, %zmm6, %zmm30
-; AVX512DQ-SLOW-NEXT:    movb $48, %sil
-; AVX512DQ-SLOW-NEXT:    kmovw %esi, %k3
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 {%k3} = zmm6[0],zmm7[0],zmm6[2],zmm7[2],zmm6[4],zmm7[4],zmm6[6],zmm7[6]
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm7, %zmm15, %zmm6
-; AVX512DQ-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm7 = [6,14,6,14]
-; AVX512DQ-SLOW-NEXT:    # ymm7 = mem[0,1,0,1]
-; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm13, %zmm4, %zmm7
-; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm13, %zmm4, %zmm15
-; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm4, %zmm13, %zmm28
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm2 {%k3} = zmm4[0],zmm13[0],zmm4[2],zmm13[2],zmm4[4],zmm13[4],zmm4[6],zmm13[6]
-; AVX512DQ-SLOW-NEXT:    vmovdqa (%rdx), %xmm1
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0]
-; AVX512DQ-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm27, %zmm30, %zmm3
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%rax), %zmm27
+; AVX512DQ-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm31 = [6,13,14,7,6,13,14,7]
+; AVX512DQ-SLOW-NEXT:    # zmm31 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm9, %zmm27, %zmm31
+; AVX512DQ-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm9 = [0,13,6,7,0,13,6,7]
+; AVX512DQ-SLOW-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm27, %zmm7, %zmm9
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm27, %zmm12, %zmm21
+; AVX512DQ-SLOW-NEXT:    vmovdqa (%r9), %ymm7
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm29, %zmm11 {%k2}
+; AVX512DQ-SLOW-NEXT:    vmovdqa (%r8), %ymm12
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm29 = ymm12[0],ymm7[0],ymm12[2],ymm7[2]
+; AVX512DQ-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm11 {%k3} = zmm29[2,3,2,3],zmm27[2,3,2,3]
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm27, %zmm30, %zmm20
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%rdx), %xmm27
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm27 = xmm27[0],mem[0]
+; AVX512DQ-SLOW-NEXT:    vinserti32x4 $1, %xmm27, %ymm0, %ymm27
 ; AVX512DQ-SLOW-NEXT:    movb $12, %sil
 ; AVX512DQ-SLOW-NEXT:    kmovw %esi, %k5
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm8 {%k5}
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm27, %zmm0, %zmm4 {%k5}
 ; AVX512DQ-SLOW-NEXT:    movb $112, %sil
 ; AVX512DQ-SLOW-NEXT:    kmovw %esi, %k7
-; AVX512DQ-SLOW-NEXT:    vinserti64x2 $3, (%rax), %zmm21, %zmm8 {%k7}
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm6 {%k1}
+; AVX512DQ-SLOW-NEXT:    vinserti64x2 $3, (%rax), %zmm26, %zmm4 {%k7}
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm26, %zmm2 {%k2}
 ; AVX512DQ-SLOW-NEXT:    movb $120, %sil
 ; AVX512DQ-SLOW-NEXT:    kmovw %esi, %k3
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm11 {%k3}
-; AVX512DQ-SLOW-NEXT:    vpbroadcastq 72(%rcx), %ymm0
-; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm13 {%k3}
+; AVX512DQ-SLOW-NEXT:    vpbroadcastq 72(%rcx), %ymm2
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = mem[0,1,2,3],ymm2[4,5,6,7]
 ; AVX512DQ-SLOW-NEXT:    movb $6, %sil
 ; AVX512DQ-SLOW-NEXT:    kmovw %esi, %k4
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm9 {%k4}
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm0, %zmm14 {%k4}
 ; AVX512DQ-SLOW-NEXT:    movb $56, %sil
 ; AVX512DQ-SLOW-NEXT:    kmovw %esi, %k6
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm9 {%k6}
-; AVX512DQ-SLOW-NEXT:    vmovdqa 64(%rdx), %xmm0
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0]
-; AVX512DQ-SLOW-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm30 {%k5}
-; AVX512DQ-SLOW-NEXT:    vinserti64x2 $3, 64(%rax), %zmm17, %zmm30 {%k7}
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm19, %zmm10 {%k2}
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm14 {%k6}
+; AVX512DQ-SLOW-NEXT:    vmovdqa 64(%rdx), %xmm2
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0]
+; AVX512DQ-SLOW-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm0, %zmm5 {%k5}
+; AVX512DQ-SLOW-NEXT:    vinserti64x2 $3, 64(%rax), %zmm16, %zmm5 {%k7}
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm25, %zmm6 {%k1}
 ; AVX512DQ-SLOW-NEXT:    movb $-31, %sil
-; AVX512DQ-SLOW-NEXT:    kmovw %esi, %k2
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm29, %zmm10 {%k2}
-; AVX512DQ-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm7[0,1,2,3],zmm25[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    kmovw %esi, %k1
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm6 {%k1}
+; AVX512DQ-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm17[0,1,2,3],zmm24[4,5,6,7]
 ; AVX512DQ-SLOW-NEXT:    movb $-61, %sil
-; AVX512DQ-SLOW-NEXT:    kmovw %esi, %k2
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm27, %zmm0 {%k2}
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm15 {%k1}
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm15, %zmm12 {%k3}
-; AVX512DQ-SLOW-NEXT:    vpbroadcastq 8(%rcx), %ymm1
-; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm28 {%k4}
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm14, %zmm28 {%k6}
-; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm23, %ymm1 # 32-byte Folded Reload
-; AVX512DQ-SLOW-NEXT:    # ymm1 = ymm23[1],mem[1],ymm23[3],mem[3]
-; AVX512DQ-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,3,3]
-; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7]
+; AVX512DQ-SLOW-NEXT:    kmovw %esi, %k1
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm2 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm22, %zmm10 {%k2}
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm21 {%k3}
+; AVX512DQ-SLOW-NEXT:    vpbroadcastq 8(%rcx), %ymm3
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm3 = mem[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm0, %zmm15 {%k4}
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm15 {%k6}
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm28[1],ymm23[1],ymm28[3],ymm23[3]
+; AVX512DQ-SLOW-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,2,3,3]
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7]
 ; AVX512DQ-SLOW-NEXT:    movb $14, %cl
 ; AVX512DQ-SLOW-NEXT:    kmovw %ecx, %k1
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm3 {%k1}
-; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm26[1],ymm5[1],ymm26[3],ymm5[3]
-; AVX512DQ-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,3,3]
-; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7]
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm2 {%k1}
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm0, %zmm0 {%k1}
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm12[1],ymm7[1],ymm12[3],ymm7[3]
+; AVX512DQ-SLOW-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,2,3,3]
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm0, %zmm1 {%k1}
 ; AVX512DQ-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm28, 64(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm18, 128(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm2, 192(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm12, 256(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, 320(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm10, 384(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm30, 448(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm9, 512(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm16, 576(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, 640(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm11, 704(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm8, (%rax)
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm24, 768(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm22, 832(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm15, 64(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm11, 128(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, 192(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm21, 256(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm2, 320(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm6, 384(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm5, 448(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm14, 512(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm8, 576(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, 640(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm13, 704(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm4, (%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm19, 768(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm18, 832(%rax)
 ; AVX512DQ-SLOW-NEXT:    vzeroupper
 ; AVX512DQ-SLOW-NEXT:    retq
 ;
 ; AVX512DQ-FAST-LABEL: store_i64_stride7_vf16:
 ; AVX512DQ-FAST:       # %bb.0:
 ; AVX512DQ-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-FAST-NEXT:    vmovdqa64 (%rdi), %zmm13
-; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%rdi), %zmm3
-; AVX512DQ-FAST-NEXT:    vmovdqa64 (%rsi), %zmm0
-; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%rsi), %zmm5
-; AVX512DQ-FAST-NEXT:    vmovdqa64 (%rdx), %zmm1
-; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%rdx), %zmm7
-; AVX512DQ-FAST-NEXT:    vmovdqa64 (%rcx), %zmm16
-; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%rcx), %zmm9
-; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%r8), %zmm30
-; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm20 = [6,14,6,14,6,14,6,14]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 (%rdi), %zmm10
+; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%rdi), %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqa64 (%rsi), %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%rsi), %zmm13
+; AVX512DQ-FAST-NEXT:    vmovdqa64 (%rdx), %zmm3
+; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%rdx), %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqa64 (%rcx), %zmm14
+; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%rcx), %zmm15
+; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%r8), %zmm21
+; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm18 = [6,14,6,14,6,14,6,14]
+; AVX512DQ-FAST-NEXT:    # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm1, %zmm4
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm15, %zmm18, %zmm4
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm5
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm13, %zmm18, %zmm5
+; AVX512DQ-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm4[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    movb $64, %sil
+; AVX512DQ-FAST-NEXT:    kmovw %esi, %k1
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm21, %zmm4 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 (%r8), %zmm7
+; AVX512DQ-FAST-NEXT:    vmovdqa64 (%r9), %zmm12
+; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%r9), %zmm6
+; AVX512DQ-FAST-NEXT:    vmovdqa64 (%rax), %zmm9
+; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%rax), %zmm5
+; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm20 = [15,7,15,7,15,7,15,7]
 ; AVX512DQ-FAST-NEXT:    # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm7, %zmm17
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm9, %zmm20, %zmm17
-; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} ymm21 = [6,14,6,14]
-; AVX512DQ-FAST-NEXT:    # ymm21 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, %zmm18
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm5, %zmm21, %zmm18
-; AVX512DQ-FAST-NEXT:    vmovdqa64 (%r8), %zmm11
-; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%r9), %zmm10
-; AVX512DQ-FAST-NEXT:    vmovdqa64 (%r9), %zmm15
-; AVX512DQ-FAST-NEXT:    vmovdqa64 (%rax), %zmm14
-; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%rax), %zmm31
-; AVX512DQ-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm4 = [0,8,0,1,0,8,0,1]
-; AVX512DQ-FAST-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm13, %zmm2
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm0, %zmm4, %zmm2
-; AVX512DQ-FAST-NEXT:    vmovdqa (%rdx), %xmm12
-; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%rdx), %xmm22
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm12 = xmm12[0],mem[0]
-; AVX512DQ-FAST-NEXT:    vinserti128 $1, %xmm12, %ymm0, %ymm12
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm1, %zmm8
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm15, %zmm20, %zmm8
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm16
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm13, %zmm20, %zmm16
+; AVX512DQ-FAST-NEXT:    movb $24, %sil
+; AVX512DQ-FAST-NEXT:    kmovw %esi, %k1
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm8, %zmm16 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm31 = <u,1,2,3,4,15,u,u>
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm21, %zmm16, %zmm31
+; AVX512DQ-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm22 = [0,8,0,1,0,8,0,1]
+; AVX512DQ-FAST-NEXT:    # zmm22 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm10, %zmm8
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm2, %zmm22, %zmm8
+; AVX512DQ-FAST-NEXT:    vmovdqa64 (%rdx), %xmm16
+; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%rdx), %xmm27
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm16 = xmm16[0],mem[0]
+; AVX512DQ-FAST-NEXT:    vinserti32x4 $1, %xmm16, %ymm0, %ymm16
 ; AVX512DQ-FAST-NEXT:    movb $12, %sil
-; AVX512DQ-FAST-NEXT:    kmovw %esi, %k2
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm12, %zmm0, %zmm2 {%k2}
-; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm24 = [0,8,0,8,0,8,0,8]
-; AVX512DQ-FAST-NEXT:    # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm11, %zmm12
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm15, %zmm24, %zmm12
+; AVX512DQ-FAST-NEXT:    kmovw %esi, %k3
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm16, %zmm0, %zmm8 {%k3}
+; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm26 = [0,8,0,8,0,8,0,8]
+; AVX512DQ-FAST-NEXT:    # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm7, %zmm28
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm12, %zmm26, %zmm28
 ; AVX512DQ-FAST-NEXT:    movb $112, %sil
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm24
+; AVX512DQ-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm17 = [3,0,12,4,3,0,12,4]
+; AVX512DQ-FAST-NEXT:    # zmm17 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm15, %zmm16
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm1, %zmm17, %zmm16
+; AVX512DQ-FAST-NEXT:    movb $48, %dil
+; AVX512DQ-FAST-NEXT:    kmovw %edi, %k2
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm16 {%k2} = zmm0[0],zmm13[0],zmm0[2],zmm13[2],zmm0[4],zmm13[4],zmm0[6],zmm13[6]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm23
+; AVX512DQ-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm25 = [1,0,10,2,1,0,10,2]
+; AVX512DQ-FAST-NEXT:    # zmm25 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm13, %zmm19
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm0, %zmm25, %zmm19
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm13, %zmm22, %zmm0
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm22 = xmm27[0],mem[0]
+; AVX512DQ-FAST-NEXT:    vinserti32x4 $1, %xmm22, %ymm0, %ymm22
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm22, %zmm0, %zmm0 {%k3}
 ; AVX512DQ-FAST-NEXT:    kmovw %esi, %k3
-; AVX512DQ-FAST-NEXT:    vinserti64x2 $3, (%rax), %zmm12, %zmm2 {%k3}
-; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm23 = [13,5,13,5,13,5,13,5]
-; AVX512DQ-FAST-NEXT:    # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm7, %zmm12
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm9, %zmm23, %zmm12
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, %zmm25
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm5, %zmm23, %zmm25
+; AVX512DQ-FAST-NEXT:    vinserti64x2 $3, (%rax), %zmm28, %zmm8 {%k3}
+; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm28 = [13,5,13,5,13,5,13,5]
+; AVX512DQ-FAST-NEXT:    # zmm28 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm1, %zmm29
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm15, %zmm28, %zmm29
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm13, %zmm28, %zmm24
 ; AVX512DQ-FAST-NEXT:    movb $96, %sil
-; AVX512DQ-FAST-NEXT:    kmovw %esi, %k1
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm12, %zmm25 {%k1}
-; AVX512DQ-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm12 = [4,12,0,5,4,12,0,5]
-; AVX512DQ-FAST-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FAST-NEXT:    vpermi2q %zmm10, %zmm30, %zmm24
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm30, %zmm19
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm10, %zmm12, %zmm19
-; AVX512DQ-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm27 = [0,1,12,7,0,1,12,7]
-; AVX512DQ-FAST-NEXT:    # zmm27 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm31, %zmm27, %zmm19
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm21, %zmm22
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm21, %zmm27
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm6, %zmm26, %zmm21
+; AVX512DQ-FAST-NEXT:    vinserti64x2 $3, 64(%rax), %zmm21, %zmm0 {%k3}
+; AVX512DQ-FAST-NEXT:    kmovw %esi, %k3
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm29, %zmm24 {%k3}
+; AVX512DQ-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm21 = [4,12,0,5,4,12,0,5]
+; AVX512DQ-FAST-NEXT:    # zmm21 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm6, %zmm21, %zmm22
+; AVX512DQ-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm29 = [0,1,12,7,0,1,12,7]
+; AVX512DQ-FAST-NEXT:    # zmm29 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm5, %zmm29, %zmm22
 ; AVX512DQ-FAST-NEXT:    movb $120, %sil
-; AVX512DQ-FAST-NEXT:    vpermi2q %zmm5, %zmm3, %zmm4
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm22 = xmm22[0],mem[0]
-; AVX512DQ-FAST-NEXT:    vinserti32x4 $1, %xmm22, %ymm0, %ymm22
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm22, %zmm0, %zmm4 {%k2}
-; AVX512DQ-FAST-NEXT:    vinserti64x2 $3, 64(%rax), %zmm24, %zmm4 {%k3}
-; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm24 = [15,7,15,7,15,7,15,7]
-; AVX512DQ-FAST-NEXT:    # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm11, %zmm22
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm15, %zmm24, %zmm22
-; AVX512DQ-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm28 = [6,13,14,7,6,13,14,7]
-; AVX512DQ-FAST-NEXT:    # zmm28 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FAST-NEXT:    vpermi2q %zmm22, %zmm14, %zmm28
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm1, %zmm29
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm16, %zmm24, %zmm29
-; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} ymm26 = [15,7,15,7]
-; AVX512DQ-FAST-NEXT:    # ymm26 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm13, %zmm22
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm0, %zmm26, %zmm22
-; AVX512DQ-FAST-NEXT:    movb $24, %dil
-; AVX512DQ-FAST-NEXT:    kmovw %edi, %k2
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm29, %zmm22 {%k2}
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, %zmm30
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm14, %zmm20, %zmm30
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm10, %zmm26
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm2, %zmm20, %zmm26
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm30, %zmm26 {%k1}
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm12, %zmm7, %zmm20
+; AVX512DQ-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm30 = [6,13,14,7,6,13,14,7]
+; AVX512DQ-FAST-NEXT:    # zmm30 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm20, %zmm9, %zmm30
 ; AVX512DQ-FAST-NEXT:    movb $-31, %dil
-; AVX512DQ-FAST-NEXT:    kmovw %edi, %k3
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm28, %zmm22 {%k3}
-; AVX512DQ-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm28 = [5,0,14,6,5,0,14,6]
-; AVX512DQ-FAST-NEXT:    # zmm28 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FAST-NEXT:    vpermi2q %zmm11, %zmm15, %zmm28
-; AVX512DQ-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm29 = [0,13,6,7,0,13,6,7]
-; AVX512DQ-FAST-NEXT:    # zmm29 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FAST-NEXT:    vpermi2q %zmm14, %zmm28, %zmm29
-; AVX512DQ-FAST-NEXT:    vpermi2q %zmm16, %zmm1, %zmm20
-; AVX512DQ-FAST-NEXT:    vpermi2q %zmm0, %zmm13, %zmm21
-; AVX512DQ-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm20 = zmm21[0,1,2,3],zmm20[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    kmovw %edi, %k1
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm30, %zmm26 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, %zmm20
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm14, %zmm18, %zmm20
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm2, %zmm10, %zmm18
+; AVX512DQ-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm18 = zmm18[0,1,2,3],zmm20[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm20 = [5,0,14,6,5,0,14,6]
+; AVX512DQ-FAST-NEXT:    # zmm20 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm7, %zmm12, %zmm20
+; AVX512DQ-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm30 = [0,13,6,7,0,13,6,7]
+; AVX512DQ-FAST-NEXT:    # zmm30 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm9, %zmm20, %zmm30
 ; AVX512DQ-FAST-NEXT:    movb $-61, %dil
-; AVX512DQ-FAST-NEXT:    kmovw %edi, %k3
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm29, %zmm20 {%k3}
-; AVX512DQ-FAST-NEXT:    kmovw %esi, %k3
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm25, %zmm19 {%k3}
-; AVX512DQ-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm25 = [3,0,12,4,3,0,12,4]
-; AVX512DQ-FAST-NEXT:    # zmm25 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm9, %zmm21
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm7, %zmm25, %zmm21
-; AVX512DQ-FAST-NEXT:    movb $48, %sil
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm1, %zmm28
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm16, %zmm23, %zmm28
-; AVX512DQ-FAST-NEXT:    vpermi2q %zmm0, %zmm13, %zmm23
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm28, %zmm23 {%k1}
-; AVX512DQ-FAST-NEXT:    vpermi2q %zmm15, %zmm11, %zmm12
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm14, %zmm27, %zmm12
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm23, %zmm12 {%k3}
-; AVX512DQ-FAST-NEXT:    kmovw %esi, %k3
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm21 {%k3} = zmm3[0],zmm5[0],zmm3[2],zmm5[2],zmm3[4],zmm5[4],zmm3[6],zmm5[6]
-; AVX512DQ-FAST-NEXT:    vmovdqa64 (%r9), %ymm23
-; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%r9), %ymm27
+; AVX512DQ-FAST-NEXT:    kmovw %edi, %k1
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm30, %zmm18 {%k1}
+; AVX512DQ-FAST-NEXT:    kmovw %esi, %k1
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm24, %zmm22 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, %zmm20
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm14, %zmm28, %zmm20
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm2, %zmm10, %zmm28
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm20, %zmm28 {%k3}
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm12, %zmm7, %zmm21
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm9, %zmm29, %zmm21
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm28, %zmm21 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqa64 (%r9), %ymm20
+; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%r9), %ymm24
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%r8), %ymm28
-; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} ymm6 = [1,3,7,7]
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %ymm28, %ymm8
-; AVX512DQ-FAST-NEXT:    vpermt2q %ymm27, %ymm6, %ymm8
-; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7]
+; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} ymm4 = [1,3,7,7]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %ymm28, %ymm11
+; AVX512DQ-FAST-NEXT:    vpermt2q %ymm24, %ymm4, %ymm11
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],mem[6,7]
 ; AVX512DQ-FAST-NEXT:    movb $14, %sil
-; AVX512DQ-FAST-NEXT:    vpermi2q %zmm1, %zmm16, %zmm25
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm25 {%k3} = zmm13[0],zmm0[0],zmm13[2],zmm0[2],zmm13[4],zmm0[4],zmm13[6],zmm0[6]
-; AVX512DQ-FAST-NEXT:    kmovw %esi, %k3
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm8, %zmm0, %zmm21 {%k3}
-; AVX512DQ-FAST-NEXT:    vmovdqa (%r8), %ymm8
-; AVX512DQ-FAST-NEXT:    vpermi2q %ymm23, %ymm8, %ymm6
-; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],mem[6,7]
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm6, %zmm0, %zmm25 {%k3}
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm6 = ymm28[0],ymm27[0],ymm28[2],ymm27[2]
-; AVX512DQ-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm17 = zmm18[0,1,2,3],zmm17[4,5,6,7]
-; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm18 = [11,3,11,3,11,3,11,3]
-; AVX512DQ-FAST-NEXT:    # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm27 = [2,10,0,3,2,10,0,3]
-; AVX512DQ-FAST-NEXT:    # zmm27 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm16, %zmm27, %zmm1
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm7, %zmm16
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm9, %zmm27, %zmm16
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, %zmm27
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm5, %zmm18, %zmm27
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm27, %zmm16 {%k1}
-; AVX512DQ-FAST-NEXT:    vpermi2q %zmm0, %zmm13, %zmm18
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm18, %zmm1 {%k1}
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm3, %zmm14, %zmm17
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm17 {%k2} = zmm10[0],zmm2[0],zmm10[2],zmm2[2],zmm10[4],zmm2[4],zmm10[6],zmm2[6]
+; AVX512DQ-FAST-NEXT:    kmovw %esi, %k1
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm11, %zmm0, %zmm16 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqa (%r8), %ymm11
+; AVX512DQ-FAST-NEXT:    vpermi2q %ymm20, %ymm11, %ymm4
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],mem[6,7]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm0, %zmm17 {%k1}
+; AVX512DQ-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm4 = [2,10,0,3,2,10,0,3]
+; AVX512DQ-FAST-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm15, %zmm4, %zmm1
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm14, %zmm4, %zmm3
+; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm4 = [11,3,11,3,11,3,11,3]
+; AVX512DQ-FAST-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm13, %zmm4, %zmm23
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm2, %zmm10, %zmm4
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm10, %zmm25, %zmm2
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm10 = ymm28[0],ymm24[0],ymm28[2],ymm24[2]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm23, %zmm1 {%k3}
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm4, %zmm3 {%k3}
 ; AVX512DQ-FAST-NEXT:    movb $28, %al
 ; AVX512DQ-FAST-NEXT:    kmovw %eax, %k1
-; AVX512DQ-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm16 {%k1} = zmm6[2,3,2,3],zmm31[2,3,2,3]
-; AVX512DQ-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm6 = [1,0,10,2,1,0,10,2]
-; AVX512DQ-FAST-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm13, %zmm6, %zmm0
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm5, %zmm13
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm3, %zmm6, %zmm13
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm6 = ymm8[0],ymm23[0],ymm8[2],ymm23[2]
-; AVX512DQ-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm6[2,3,2,3],zmm14[2,3,2,3]
-; AVX512DQ-FAST-NEXT:    vpbroadcastq 72(%rcx), %ymm6
-; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm6 = mem[0,1,2,3],ymm6[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm10[2,3,2,3],zmm5[2,3,2,3]
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm4 = ymm11[0],ymm20[0],ymm11[2],ymm20[2]
+; AVX512DQ-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm3 {%k1} = zmm4[2,3,2,3],zmm9[2,3,2,3]
+; AVX512DQ-FAST-NEXT:    vpbroadcastq 72(%rcx), %ymm4
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm4 = mem[0,1,2,3],ymm4[4,5,6,7]
 ; AVX512DQ-FAST-NEXT:    movb $6, %al
 ; AVX512DQ-FAST-NEXT:    kmovw %eax, %k1
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm6, %zmm0, %zmm13 {%k1}
-; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm6 = [9,1,9,1,9,1,9,1]
-; AVX512DQ-FAST-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm15, %zmm6, %zmm11
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm30, %zmm8
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm10, %zmm6, %zmm8
-; AVX512DQ-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm6 = [4,9,0,3,4,9,0,3]
-; AVX512DQ-FAST-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm14, %zmm6, %zmm11
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm31, %zmm6, %zmm8
-; AVX512DQ-FAST-NEXT:    vpbroadcastq 8(%rcx), %ymm6
-; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm6 = mem[0,1,2,3],ymm6[4,5,6,7]
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm6, %zmm0, %zmm0 {%k1}
-; AVX512DQ-FAST-NEXT:    movb $64, %al
-; AVX512DQ-FAST-NEXT:    kmovw %eax, %k1
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm30, %zmm17 {%k1}
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm0, %zmm19 {%k1}
+; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm4 = [9,1,9,1,9,1,9,1]
+; AVX512DQ-FAST-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm12, %zmm4, %zmm7
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm6, %zmm4, %zmm27
+; AVX512DQ-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm4 = [4,9,0,3,4,9,0,3]
+; AVX512DQ-FAST-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm9, %zmm4, %zmm7
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm5, %zmm4, %zmm27
+; AVX512DQ-FAST-NEXT:    vpbroadcastq 8(%rcx), %ymm4
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm4 = mem[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm0, %zmm2 {%k1}
 ; AVX512DQ-FAST-NEXT:    movb $56, %al
 ; AVX512DQ-FAST-NEXT:    kmovw %eax, %k1
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm8, %zmm13 {%k1}
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm11, %zmm0 {%k1}
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm9, %zmm24, %zmm7
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm5, %zmm26, %zmm3
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm7, %zmm3 {%k2}
-; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm5 = <u,1,2,3,4,15,u,u>
-; AVX512DQ-FAST-NEXT:    vpermi2q %zmm30, %zmm3, %zmm5
-; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm3 = <u,1,2,3,4,5,15,u>
-; AVX512DQ-FAST-NEXT:    vpermi2q %zmm10, %zmm5, %zmm3
-; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm5 = <13,u,2,3,4,5,6,14>
-; AVX512DQ-FAST-NEXT:    vpermi2q %zmm10, %zmm17, %zmm5
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm27, %zmm19 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm7, %zmm2 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm4 = <u,1,2,3,4,5,15,u>
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm6, %zmm31, %zmm4
+; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm7 = <13,u,2,3,4,5,6,14>
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm6, %zmm9, %zmm7
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm6 = [14,1,2,3,4,5,6,15]
-; AVX512DQ-FAST-NEXT:    vpermi2q %zmm31, %zmm3, %zmm6
-; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,13,2,3,4,5,6,7]
-; AVX512DQ-FAST-NEXT:    vpermi2q %zmm31, %zmm5, %zmm3
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm5, %zmm4, %zmm6
+; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [0,13,2,3,4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm5, %zmm7, %zmm4
 ; AVX512DQ-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, 64(%rax)
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm1, 128(%rax)
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm25, 192(%rax)
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm12, 256(%rax)
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm20, 320(%rax)
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm22, 384(%rax)
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm4, 448(%rax)
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm13, 512(%rax)
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm16, 576(%rax)
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm21, 640(%rax)
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm19, 704(%rax)
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, (%rax)
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, 768(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, 64(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, 128(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm17, 192(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm21, 256(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm18, 320(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm26, 384(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, 448(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm19, 512(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm1, 576(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm16, 640(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm22, 704(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm8, (%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm4, 768(%rax)
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm6, 832(%rax)
 ; AVX512DQ-FAST-NEXT:    vzeroupper
 ; AVX512DQ-FAST-NEXT:    retq
@@ -3351,841 +3323,823 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-ONLY-SLOW-LABEL: store_i64_stride7_vf16:
 ; AVX512BW-ONLY-SLOW:       # %bb.0:
 ; AVX512BW-ONLY-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm4
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 64(%rdi), %zmm6
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%rsi), %zmm5
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 64(%rsi), %zmm7
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%rdx), %zmm8
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 64(%rdx), %zmm9
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%rcx), %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 64(%rcx), %zmm3
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 64(%r8), %zmm11
-; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [6,14,6,14,6,14,6,14]
-; AVX512BW-ONLY-SLOW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm6
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 64(%rdi), %zmm18
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%rsi), %zmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 64(%rsi), %zmm9
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%rdx), %zmm11
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 64(%rdx), %zmm20
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%rcx), %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 64(%rcx), %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 64(%r8), %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm16 = [6,14,6,14,6,14,6,14]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm19
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm2, %zmm16, %zmm19
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm8 = [15,7,15,7,15,7,15,7]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm21
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm2, %zmm8, %zmm21
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm22
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm5 = [0,8,0,1,0,8,0,1]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm5, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm9, %zmm18, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm10 = [13,5,13,5,13,5,13,5]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm2, %zmm10, %zmm0
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm12
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm0, %zmm12
-; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [6,14,6,14]
-; AVX512BW-ONLY-SLOW-NEXT:    # ymm0 = mem[0,1,0,1]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm15
-; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} ymm16 = [15,7,15,7]
-; AVX512BW-ONLY-SLOW-NEXT:    # ymm16 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm10
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm16, %zmm10
-; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm7, %zmm6, %zmm16
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm13
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm0, %zmm13
-; AVX512BW-ONLY-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm23 = zmm13[0,1,2,3],zmm12[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm15
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm23
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm13 = [2,10,0,3,2,10,0,3]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm12
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm2, %zmm13, %zmm12
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm17 = [1,0,10,2,1,0,10,2]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm17 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm14
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm18, %zmm17, %zmm14
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm29 = [3,0,12,4,3,0,12,4]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm29 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm20, %zmm29, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    movb $48, %sil
+; AVX512BW-ONLY-SLOW-NEXT:    kmovd %esi, %k3
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm2 {%k3} = zmm18[0],zmm9[0],zmm18[2],zmm9[2],zmm18[4],zmm9[4],zmm18[6],zmm9[6]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm9, %zmm16, %zmm18
+; AVX512BW-ONLY-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm19 = zmm18[0,1,2,3],zmm19[4,5,6,7]
 ; AVX512BW-ONLY-SLOW-NEXT:    movb $64, %sil
 ; AVX512BW-ONLY-SLOW-NEXT:    kmovd %esi, %k1
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm23 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%r8), %zmm14
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 64(%r9), %zmm24
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%r9), %zmm25
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%rax), %zmm22
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 64(%rax), %zmm30
-; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm12 = [0,8,0,1,0,8,0,1]
-; AVX512BW-ONLY-SLOW-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm20 = [0,8,0,8,0,8,0,8]
-; AVX512BW-ONLY-SLOW-NEXT:    # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm14, %zmm21
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm25, %zmm20, %zmm21
-; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm24, %zmm11, %zmm20
-; AVX512BW-ONLY-SLOW-NEXT:    movb $96, %sil
-; AVX512BW-ONLY-SLOW-NEXT:    kmovd %esi, %k1
-; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm13 = [4,12,0,5,4,12,0,5]
-; AVX512BW-ONLY-SLOW-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm18
-; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm17 = [15,7,15,7,15,7,15,7]
-; AVX512BW-ONLY-SLOW-NEXT:    # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm14, %zmm26
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm25, %zmm17, %zmm26
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm19
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm2, %zmm17, %zmm19
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm19 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%r8), %zmm20
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%r9), %zmm30
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 64(%r9), %zmm31
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 64(%rax), %zmm28
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm9, %zmm8, %zmm22
 ; AVX512BW-ONLY-SLOW-NEXT:    movb $24, %sil
+; AVX512BW-ONLY-SLOW-NEXT:    kmovd %esi, %k1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm22 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm18 = <u,1,2,3,4,15,u,u>
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm3, %zmm22, %zmm18
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm21 = <u,1,2,3,4,5,15,u>
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm31, %zmm18, %zmm21
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm18 = [14,1,2,3,4,5,6,15]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm28, %zmm21, %zmm18
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm21 = <13,u,2,3,4,5,6,14>
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm31, %zmm19, %zmm21
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm19 = [0,13,2,3,4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm28, %zmm21, %zmm19
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm26 = [0,8,0,8,0,8,0,8]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm27
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm30, %zmm26, %zmm27
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm31, %zmm3, %zmm26
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm9, %zmm10, %zmm15
+; AVX512BW-ONLY-SLOW-NEXT:    movb $96, %sil
 ; AVX512BW-ONLY-SLOW-NEXT:    kmovd %esi, %k2
-; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm3, %zmm9, %zmm17
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm16 {%k2}
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm27 = <u,1,2,3,4,15,u,u>
-; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm11, %zmm16, %zmm27
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm24, %zmm13, %zmm11
-; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm29 = [0,1,12,7,0,1,12,7]
-; AVX512BW-ONLY-SLOW-NEXT:    # zmm29 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm30, %zmm29, %zmm11
-; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm31 = [11,3,11,3,11,3,11,3]
-; AVX512BW-ONLY-SLOW-NEXT:    # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm31, %zmm15
-; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm17 = [2,10,0,3,2,10,0,3]
-; AVX512BW-ONLY-SLOW-NEXT:    # zmm17 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm16
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm17, %zmm16
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm15, %zmm16 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm28 = [9,1,9,1,9,1,9,1]
-; AVX512BW-ONLY-SLOW-NEXT:    # zmm28 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [5,0,14,6,5,0,14,6]
-; AVX512BW-ONLY-SLOW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm14, %zmm25, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm25, %zmm14, %zmm13
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm25, %zmm28, %zmm14
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 64(%r9), %ymm15
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm24, %zmm28, %zmm18
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm25 = <u,1,2,3,4,5,15,u>
-; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm24, %zmm27, %zmm25
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = <13,u,2,3,4,5,6,14>
-; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm24, %zmm23, %zmm1
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 64(%r8), %ymm24
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm23 = ymm24[0],ymm15[0],ymm24[2],ymm15[2]
-; AVX512BW-ONLY-SLOW-NEXT:    movb $28, %sil
-; AVX512BW-ONLY-SLOW-NEXT:    kmovd %esi, %k3
-; AVX512BW-ONLY-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm16 {%k3} = zmm23[2,3,2,3],zmm30[2,3,2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm28 = [6,13,14,7,6,13,14,7]
-; AVX512BW-ONLY-SLOW-NEXT:    # zmm28 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm26, %zmm22, %zmm28
-; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm27 = [0,13,6,7,0,13,6,7]
-; AVX512BW-ONLY-SLOW-NEXT:    # zmm27 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm22, %zmm0, %zmm27
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa (%r9), %ymm15
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm22, %zmm29, %zmm13
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%r8), %ymm26
-; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm5, %zmm4, %zmm31
-; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm2, %zmm8, %zmm17
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm17 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm23 = ymm26[0],ymm15[0],ymm26[2],ymm15[2]
-; AVX512BW-ONLY-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm17 {%k3} = zmm23[2,3,2,3],zmm22[2,3,2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm23 = [4,9,0,3,4,9,0,3]
-; AVX512BW-ONLY-SLOW-NEXT:    # zmm23 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm22, %zmm23, %zmm14
-; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm29 = [13,5,13,5,13,5,13,5]
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm21 = [4,12,0,5,4,12,0,5]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm21 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm25
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm8, %zmm25
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm24
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm16, %zmm24
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm22
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm10, %zmm22
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm1, %zmm11, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm11, %zmm29, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm11
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm31, %zmm21, %zmm11
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm29 = [11,3,11,3,11,3,11,3]
 ; AVX512BW-ONLY-SLOW-NEXT:    # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm30, %zmm23, %zmm18
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm22 = [14,1,2,3,4,5,6,15]
-; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm30, %zmm25, %zmm22
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm23 = [0,13,2,3,4,5,6,7]
-; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm30, %zmm1, %zmm23
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm1
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm29, %zmm1
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm2, %zmm8, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm25 = [3,0,12,4,3,0,12,4]
-; AVX512BW-ONLY-SLOW-NEXT:    # zmm25 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm9, %zmm25, %zmm3
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm30
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm2, %zmm29, %zmm30
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm8, %zmm25, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm8
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm12, %zmm8
-; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm7, %zmm6, %zmm12
-; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm31 = [1,0,10,2,1,0,10,2]
-; AVX512BW-ONLY-SLOW-NEXT:    # zmm31 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm9
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm6, %zmm31, %zmm9
-; AVX512BW-ONLY-SLOW-NEXT:    movb $48, %sil
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm9, %zmm29, %zmm23
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm9 = [0,1,12,7,0,1,12,7]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm28, %zmm9, %zmm11
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm12 {%k2}
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm23 = [9,1,9,1,9,1,9,1]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm31, %zmm23, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 64(%r9), %ymm31
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm7, %zmm6, %zmm16
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm7, %zmm6, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm7, %zmm6, %zmm29
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm6, %zmm7, %zmm17
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm1 {%k3} = zmm6[0],zmm7[0],zmm6[2],zmm7[2],zmm6[4],zmm7[4],zmm6[6],zmm7[6]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm8, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm30, %zmm20, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [5,0,14,6,5,0,14,6]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm20, %zmm30, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm30, %zmm20, %zmm21
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm30, %zmm23, %zmm20
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 64(%r8), %ymm23
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm30 = ymm23[0],ymm31[0],ymm23[2],ymm31[2]
+; AVX512BW-ONLY-SLOW-NEXT:    movb $28, %sil
 ; AVX512BW-ONLY-SLOW-NEXT:    kmovd %esi, %k3
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 {%k3} = zmm6[0],zmm7[0],zmm6[2],zmm7[2],zmm6[4],zmm7[4],zmm6[6],zmm7[6]
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm29, %zmm6
-; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm7 = [6,14,6,14]
-; AVX512BW-ONLY-SLOW-NEXT:    # ymm7 = mem[0,1,0,1]
-; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm5, %zmm4, %zmm7
-; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm5, %zmm4, %zmm29
-; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm4, %zmm5, %zmm31
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm2 {%k3} = zmm4[0],zmm5[0],zmm4[2],zmm5[2],zmm4[4],zmm5[4],zmm4[6],zmm5[6]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa (%rdx), %xmm4
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],mem[0]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512BW-ONLY-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm12 {%k3} = zmm30[2,3,2,3],zmm28[2,3,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm30 = [4,9,0,3,4,9,0,3]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm30 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm28, %zmm30, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%rax), %zmm28
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [6,13,14,7,6,13,14,7]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm8, %zmm28, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [0,13,6,7,0,13,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm28, %zmm7, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm28, %zmm9, %zmm21
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa (%r9), %ymm7
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm29, %zmm13 {%k2}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa (%r8), %ymm9
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm29 = ymm9[0],ymm7[0],ymm9[2],ymm7[2]
+; AVX512BW-ONLY-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm13 {%k3} = zmm29[2,3,2,3],zmm28[2,3,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm28, %zmm30, %zmm20
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%rdx), %xmm28
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm28 = xmm28[0],mem[0]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti32x4 $1, %xmm28, %ymm0, %ymm28
 ; AVX512BW-ONLY-SLOW-NEXT:    movb $12, %sil
 ; AVX512BW-ONLY-SLOW-NEXT:    kmovd %esi, %k3
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm0, %zmm8 {%k3}
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti32x4 $3, (%rax), %zmm21, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm28, %zmm0, %zmm4 {%k3}
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti32x4 $3, (%rax), %zmm27, %zmm27
 ; AVX512BW-ONLY-SLOW-NEXT:    movb $112, %sil
 ; AVX512BW-ONLY-SLOW-NEXT:    kmovd %esi, %k4
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm8 {%k4}
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 64(%rdx), %xmm4
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],mem[0]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm0, %zmm12 {%k3}
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti32x4 $3, 64(%rax), %zmm20, %zmm4
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm12 {%k4}
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm6 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm27, %zmm4 {%k4}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 64(%rdx), %xmm27
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm27 = xmm27[0],mem[0]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti32x4 $1, %xmm27, %ymm0, %ymm27
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm27, %zmm0, %zmm5 {%k3}
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti32x4 $3, 64(%rax), %zmm26, %zmm26
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm26, %zmm5 {%k4}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm26, %zmm15 {%k2}
 ; AVX512BW-ONLY-SLOW-NEXT:    movb $120, %sil
 ; AVX512BW-ONLY-SLOW-NEXT:    kmovd %esi, %k3
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm11 {%k3}
-; AVX512BW-ONLY-SLOW-NEXT:    vpbroadcastq 72(%rcx), %ymm1
-; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm15, %zmm11 {%k3}
+; AVX512BW-ONLY-SLOW-NEXT:    vpbroadcastq 72(%rcx), %ymm15
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm15 = mem[0,1,2,3],ymm15[4,5,6,7]
 ; AVX512BW-ONLY-SLOW-NEXT:    movb $6, %sil
 ; AVX512BW-ONLY-SLOW-NEXT:    kmovd %esi, %k4
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm9 {%k4}
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm15, %zmm0, %zmm14 {%k4}
 ; AVX512BW-ONLY-SLOW-NEXT:    movb $56, %sil
 ; AVX512BW-ONLY-SLOW-NEXT:    kmovd %esi, %k5
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm9 {%k5}
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm19, %zmm10 {%k2}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm14 {%k5}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm25, %zmm6 {%k1}
 ; AVX512BW-ONLY-SLOW-NEXT:    movb $-31, %sil
-; AVX512BW-ONLY-SLOW-NEXT:    kmovd %esi, %k2
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm10 {%k2}
-; AVX512BW-ONLY-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm7[0,1,2,3],zmm0[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    kmovd %esi, %k1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm6 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm16[0,1,2,3],zmm24[4,5,6,7]
 ; AVX512BW-ONLY-SLOW-NEXT:    movb $-61, %sil
-; AVX512BW-ONLY-SLOW-NEXT:    kmovd %esi, %k2
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm27, %zmm1 {%k2}
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm30, %zmm29 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm29, %zmm13 {%k3}
-; AVX512BW-ONLY-SLOW-NEXT:    vpbroadcastq 8(%rcx), %ymm4
-; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm4 = mem[0,1,2,3],ymm4[4,5,6,7]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm0, %zmm31 {%k4}
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm14, %zmm31 {%k5}
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm24, %ymm4 # 32-byte Folded Reload
-; AVX512BW-ONLY-SLOW-NEXT:    # ymm4 = ymm24[1],mem[1],ymm24[3],mem[3]
-; AVX512BW-ONLY-SLOW-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,2,3,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],mem[6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    kmovd %esi, %k1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm0 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm22, %zmm10 {%k2}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm21 {%k3}
+; AVX512BW-ONLY-SLOW-NEXT:    vpbroadcastq 8(%rcx), %ymm3
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm3 = mem[0,1,2,3],ymm3[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm0, %zmm17 {%k4}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm17 {%k5}
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm23[1],ymm31[1],ymm23[3],ymm31[3]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,2,3,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7]
 ; AVX512BW-ONLY-SLOW-NEXT:    movb $14, %cl
 ; AVX512BW-ONLY-SLOW-NEXT:    kmovd %ecx, %k1
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm0, %zmm3 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm26[1],ymm15[1],ymm26[3],ymm15[3]
-; AVX512BW-ONLY-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,3,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm0, %zmm2 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm9[1],ymm7[1],ymm9[3],ymm7[3]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,2,3,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm0, %zmm1 {%k1}
 ; AVX512BW-ONLY-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm31, 64(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm17, 128(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, 256(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, 320(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm10, 384(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, 512(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm16, 576(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm17, 64(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, 128(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm21, 256(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, 320(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, 384(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm14, 512(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm12, 576(%rax)
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm11, 704(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, 192(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm12, 448(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, 640(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, (%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm23, 768(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm22, 832(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, 192(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, 448(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, 640(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, (%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm19, 768(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm18, 832(%rax)
 ; AVX512BW-ONLY-SLOW-NEXT:    vzeroupper
 ; AVX512BW-ONLY-SLOW-NEXT:    retq
 ;
 ; AVX512BW-ONLY-FAST-LABEL: store_i64_stride7_vf16:
 ; AVX512BW-ONLY-FAST:       # %bb.0:
 ; AVX512BW-ONLY-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%rdi), %zmm12
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%rdi), %zmm4
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%rsi), %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%rsi), %zmm5
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%rdx), %zmm1
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%rdx), %zmm7
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%rcx), %zmm17
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%rcx), %zmm9
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%r8), %zmm30
-; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm20 = [6,14,6,14,6,14,6,14]
-; AVX512BW-ONLY-FAST-NEXT:    # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, %zmm18
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm9, %zmm20, %zmm18
-; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} ymm21 = [6,14,6,14]
-; AVX512BW-ONLY-FAST-NEXT:    # ymm21 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm19
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm21, %zmm19
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%r8), %zmm11
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%r9), %zmm10
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%r9), %zmm15
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%rax), %zmm14
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%rax), %zmm31
-; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [0,8,0,1,0,8,0,1]
-; AVX512BW-ONLY-FAST-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm12, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm3, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa (%rdx), %xmm13
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%rdx), %xmm16
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm13 = xmm13[0],mem[0]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
-; AVX512BW-ONLY-FAST-NEXT:    movb $12, %sil
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%rdi), %zmm18
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%rdi), %zmm26
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%rsi), %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%rsi), %zmm19
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%rdx), %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%rdx), %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%rcx), %zmm15
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%rcx), %zmm21
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%r8), %zmm6
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm17 = [6,14,6,14,6,14,6,14]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm21, %zmm17, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm26, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm19, %zmm17, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm8 = zmm4[0,1,2,3],zmm0[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    movb $64, %sil
 ; AVX512BW-ONLY-FAST-NEXT:    kmovd %esi, %k1
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm13, %zmm0, %zmm2 {%k1}
-; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm13 = [0,8,0,8,0,8,0,8]
-; AVX512BW-ONLY-FAST-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm11, %zmm22
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm15, %zmm13, %zmm22
-; AVX512BW-ONLY-FAST-NEXT:    vinserti32x4 $3, (%rax), %zmm22, %zmm22
-; AVX512BW-ONLY-FAST-NEXT:    movb $112, %sil
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm6, %zmm8 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%r8), %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%r9), %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%r9), %zmm23
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%rax), %zmm12
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm11 = [15,7,15,7,15,7,15,7]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm21, %zmm11, %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm26, %zmm9
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm19, %zmm11, %zmm9
+; AVX512BW-ONLY-FAST-NEXT:    movb $24, %sil
+; AVX512BW-ONLY-FAST-NEXT:    kmovd %esi, %k1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, %zmm9 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm7 = <u,1,2,3,4,15,u,u>
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm6, %zmm9, %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm9 = <u,1,2,3,4,5,15,u>
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm23, %zmm7, %zmm9
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [14,1,2,3,4,5,6,15]
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm12, %zmm9, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm9 = <13,u,2,3,4,5,6,14>
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm23, %zmm8, %zmm9
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm8 = [0,13,2,3,4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm12, %zmm9, %zmm8
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [0,8,0,1,0,8,0,1]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm18, %zmm9
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm2, %zmm10, %zmm9
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm7 = [0,8,0,8,0,8,0,8]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm22
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm7, %zmm22
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm19, %zmm26, %zmm10
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm23, %zmm6, %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm14 = [13,5,13,5,13,5,13,5]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm25
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm21, %zmm14, %zmm25
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm26, %zmm24
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm19, %zmm14, %zmm24
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm13 = [4,12,0,5,4,12,0,5]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm20 = [3,0,12,4,3,0,12,4]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm20 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm21, %zmm16
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm20, %zmm16
+; AVX512BW-ONLY-FAST-NEXT:    movb $48, %sil
 ; AVX512BW-ONLY-FAST-NEXT:    kmovd %esi, %k2
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm22, %zmm2 {%k2}
-; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm5, %zmm4, %zmm3
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm16 = xmm16[0],mem[0]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti32x4 $1, %xmm16, %ymm0, %ymm16
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm16, %zmm0, %zmm3 {%k1}
-; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm10, %zmm30, %zmm13
-; AVX512BW-ONLY-FAST-NEXT:    vinserti32x4 $3, 64(%rax), %zmm13, %zmm13
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm3 {%k2}
-; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm24 = [13,5,13,5,13,5,13,5]
-; AVX512BW-ONLY-FAST-NEXT:    # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, %zmm13
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm9, %zmm24, %zmm13
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm26
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm24, %zmm26
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm16 {%k2} = zmm26[0],zmm19[0],zmm26[2],zmm19[2],zmm26[4],zmm19[4],zmm26[6],zmm19[6]
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm27 = [11,3,11,3,11,3,11,3]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm26, %zmm29
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm19, %zmm27, %zmm29
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm31 = [1,0,10,2,1,0,10,2]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm31 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm26, %zmm31, %zmm19
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm18, %zmm26
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm2, %zmm11, %zmm26
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm28
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm15, %zmm17, %zmm28
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm2, %zmm18, %zmm17
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm30
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm15, %zmm14, %zmm30
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm2, %zmm18, %zmm14
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm3, %zmm15, %zmm20
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm20 {%k2} = zmm18[0],zmm2[0],zmm18[2],zmm2[2],zmm18[4],zmm2[4],zmm18[6],zmm2[6]
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm2, %zmm18, %zmm27
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm18, %zmm31, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm6, %zmm18
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm23, %zmm13, %zmm18
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm31 = [2,10,0,3,2,10,0,3]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm31 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm21, %zmm31, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [9,1,9,1,9,1,9,1]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm23, %zmm0, %zmm6
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm21
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm15, %zmm31, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%r9), %ymm23
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm15, %zmm11, %zmm21
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm5, %zmm4, %zmm11
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm31 = [5,0,14,6,5,0,14,6]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm31 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm4, %zmm5, %zmm31
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm5, %zmm4, %zmm13
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm0, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 64(%r8), %ymm0
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm5 = ymm0[0],ymm23[0],ymm0[2],ymm23[2]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} ymm15 = [1,3,7,7]
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %ymm23, %ymm15, %ymm0
 ; AVX512BW-ONLY-FAST-NEXT:    movb $96, %sil
-; AVX512BW-ONLY-FAST-NEXT:    kmovd %esi, %k1
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm26 {%k1}
-; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm16 = [4,12,0,5,4,12,0,5]
-; AVX512BW-ONLY-FAST-NEXT:    # zmm16 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm30, %zmm13
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm10, %zmm16, %zmm13
-; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm27 = [0,1,12,7,0,1,12,7]
-; AVX512BW-ONLY-FAST-NEXT:    # zmm27 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm31, %zmm27, %zmm13
-; AVX512BW-ONLY-FAST-NEXT:    movb $120, %sil
-; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm23 = [15,7,15,7,15,7,15,7]
-; AVX512BW-ONLY-FAST-NEXT:    # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm11, %zmm22
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm15, %zmm23, %zmm22
-; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm28 = [6,13,14,7,6,13,14,7]
-; AVX512BW-ONLY-FAST-NEXT:    # zmm28 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm22, %zmm14, %zmm28
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm29
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm17, %zmm23, %zmm29
-; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} ymm25 = [15,7,15,7]
-; AVX512BW-ONLY-FAST-NEXT:    # ymm25 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm12, %zmm22
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm25, %zmm22
-; AVX512BW-ONLY-FAST-NEXT:    movb $24, %dil
-; AVX512BW-ONLY-FAST-NEXT:    kmovd %edi, %k2
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm29, %zmm22 {%k2}
-; AVX512BW-ONLY-FAST-NEXT:    movb $-31, %dil
-; AVX512BW-ONLY-FAST-NEXT:    kmovd %edi, %k3
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm28, %zmm22 {%k3}
-; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm28 = [5,0,14,6,5,0,14,6]
-; AVX512BW-ONLY-FAST-NEXT:    # zmm28 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm11, %zmm15, %zmm28
-; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm29 = [0,13,6,7,0,13,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    kmovd %esi, %k2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm29, %zmm1 {%k2}
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm23 = [0,1,12,7,0,1,12,7]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm23 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm12, %zmm23, %zmm18
+; AVX512BW-ONLY-FAST-NEXT:    movb $28, %sil
+; AVX512BW-ONLY-FAST-NEXT:    kmovd %esi, %k3
+; AVX512BW-ONLY-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k3} = zmm5[2,3,2,3],zmm12[2,3,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm5 = [4,9,0,3,4,9,0,3]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm12, %zmm5, %zmm6
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%rax), %zmm12
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm29 = [6,13,14,7,6,13,14,7]
 ; AVX512BW-ONLY-FAST-NEXT:    # zmm29 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm14, %zmm28, %zmm29
-; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm17, %zmm1, %zmm20
-; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm0, %zmm12, %zmm21
-; AVX512BW-ONLY-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm20 = zmm21[0,1,2,3],zmm20[4,5,6,7]
-; AVX512BW-ONLY-FAST-NEXT:    movb $-61, %dil
-; AVX512BW-ONLY-FAST-NEXT:    kmovd %edi, %k3
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm29, %zmm20 {%k3}
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm11, %zmm12, %zmm29
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm11 = [0,13,6,7,0,13,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm12, %zmm31, %zmm11
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm12, %zmm23, %zmm13
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%r9), %ymm23
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm27, %zmm3 {%k2}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%r8), %ymm27
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %ymm23, %ymm27, %ymm15
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm23 = ymm27[0],ymm23[0],ymm27[2],ymm23[2]
+; AVX512BW-ONLY-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm3 {%k3} = zmm23[2,3,2,3],zmm12[2,3,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm12, %zmm5, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa (%rdx), %xmm5
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX512BW-ONLY-FAST-NEXT:    movb $12, %sil
 ; AVX512BW-ONLY-FAST-NEXT:    kmovd %esi, %k3
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm26, %zmm13 {%k3}
-; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm26 = [3,0,12,4,3,0,12,4]
-; AVX512BW-ONLY-FAST-NEXT:    # zmm26 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, %zmm21
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm26, %zmm21
-; AVX512BW-ONLY-FAST-NEXT:    movb $48, %sil
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm28
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm17, %zmm24, %zmm28
-; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm0, %zmm12, %zmm24
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm28, %zmm24 {%k1}
-; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm15, %zmm11, %zmm16
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm14, %zmm27, %zmm16
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm24, %zmm16 {%k3}
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm5, %zmm0, %zmm9 {%k3}
+; AVX512BW-ONLY-FAST-NEXT:    vinserti32x4 $3, (%rax), %zmm22, %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    movb $112, %sil
+; AVX512BW-ONLY-FAST-NEXT:    kmovd %esi, %k4
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, %zmm9 {%k4}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 64(%rdx), %xmm5
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm5, %zmm0, %zmm10 {%k3}
+; AVX512BW-ONLY-FAST-NEXT:    vinserti32x4 $3, 64(%rax), %zmm7, %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, %zmm10 {%k4}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm25, %zmm24 {%k2}
+; AVX512BW-ONLY-FAST-NEXT:    movb $120, %sil
 ; AVX512BW-ONLY-FAST-NEXT:    kmovd %esi, %k3
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm21 {%k3} = zmm4[0],zmm5[0],zmm4[2],zmm5[2],zmm4[4],zmm5[4],zmm4[6],zmm5[6]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%r9), %ymm24
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%r9), %ymm27
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%r8), %ymm28
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} ymm6 = [1,3,7,7]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %ymm28, %ymm8
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %ymm27, %ymm6, %ymm8
-; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm24, %zmm18 {%k3}
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
 ; AVX512BW-ONLY-FAST-NEXT:    movb $14, %sil
-; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm1, %zmm17, %zmm26
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm26 {%k3} = zmm12[0],zmm0[0],zmm12[2],zmm0[2],zmm12[4],zmm0[4],zmm12[6],zmm0[6]
-; AVX512BW-ONLY-FAST-NEXT:    kmovd %esi, %k3
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm8, %zmm0, %zmm21 {%k3}
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa (%r8), %ymm8
-; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %ymm24, %ymm8, %ymm6
-; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],mem[6,7]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm6, %zmm0, %zmm26 {%k3}
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm6 = ymm28[0],ymm27[0],ymm28[2],ymm27[2]
-; AVX512BW-ONLY-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm18 = zmm19[0,1,2,3],zmm18[4,5,6,7]
-; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm19 = [11,3,11,3,11,3,11,3]
-; AVX512BW-ONLY-FAST-NEXT:    # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm27 = [2,10,0,3,2,10,0,3]
-; AVX512BW-ONLY-FAST-NEXT:    # zmm27 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm17, %zmm27, %zmm1
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, %zmm17
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm9, %zmm27, %zmm17
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm27
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm19, %zmm27
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm27, %zmm17 {%k1}
-; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm0, %zmm12, %zmm19
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm19, %zmm1 {%k1}
-; AVX512BW-ONLY-FAST-NEXT:    movb $28, %al
-; AVX512BW-ONLY-FAST-NEXT:    kmovd %eax, %k1
-; AVX512BW-ONLY-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm6[2,3,2,3],zmm31[2,3,2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm6 = [1,0,10,2,1,0,10,2]
-; AVX512BW-ONLY-FAST-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm12, %zmm6, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, %zmm12
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm6, %zmm12
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm6 = ymm8[0],ymm24[0],ymm8[2],ymm24[2]
-; AVX512BW-ONLY-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm6[2,3,2,3],zmm14[2,3,2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vpbroadcastq 72(%rcx), %ymm6
-; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm6 = mem[0,1,2,3],ymm6[4,5,6,7]
-; AVX512BW-ONLY-FAST-NEXT:    movb $6, %al
-; AVX512BW-ONLY-FAST-NEXT:    kmovd %eax, %k1
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm6, %zmm0, %zmm12 {%k1}
-; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm6 = [9,1,9,1,9,1,9,1]
-; AVX512BW-ONLY-FAST-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm15, %zmm6, %zmm11
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm30, %zmm8
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm10, %zmm6, %zmm8
-; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm6 = [4,9,0,3,4,9,0,3]
-; AVX512BW-ONLY-FAST-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm14, %zmm6, %zmm11
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm31, %zmm6, %zmm8
-; AVX512BW-ONLY-FAST-NEXT:    vpbroadcastq 8(%rcx), %ymm6
-; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm6 = mem[0,1,2,3],ymm6[4,5,6,7]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm6, %zmm0, %zmm0 {%k1}
-; AVX512BW-ONLY-FAST-NEXT:    movb $64, %al
-; AVX512BW-ONLY-FAST-NEXT:    kmovd %eax, %k1
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm30, %zmm18 {%k1}
-; AVX512BW-ONLY-FAST-NEXT:    movb $56, %al
-; AVX512BW-ONLY-FAST-NEXT:    kmovd %eax, %k1
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, %zmm12 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    kmovd %esi, %k4
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm16 {%k4}
+; AVX512BW-ONLY-FAST-NEXT:    vpbroadcastq 72(%rcx), %ymm0
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    movb $6, %sil
+; AVX512BW-ONLY-FAST-NEXT:    kmovd %esi, %k5
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm19 {%k5}
+; AVX512BW-ONLY-FAST-NEXT:    movb $56, %sil
+; AVX512BW-ONLY-FAST-NEXT:    kmovd %esi, %k6
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm6, %zmm19 {%k6}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm21, %zmm26 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    movb $-31, %sil
+; AVX512BW-ONLY-FAST-NEXT:    kmovd %esi, %k1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm29, %zmm26 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm17[0,1,2,3],zmm28[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    movb $-61, %sil
+; AVX512BW-ONLY-FAST-NEXT:    kmovd %esi, %k1
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm11, %zmm0 {%k1}
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm9, %zmm23, %zmm7
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm25, %zmm4
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, %zmm4 {%k2}
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm5 = <u,1,2,3,4,15,u,u>
-; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm30, %zmm4, %zmm5
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm4 = <u,1,2,3,4,5,15,u>
-; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm10, %zmm5, %zmm4
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm5 = <13,u,2,3,4,5,6,14>
-; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm10, %zmm18, %zmm5
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm6 = [14,1,2,3,4,5,6,15]
-; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm31, %zmm4, %zmm6
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [0,13,2,3,4,5,6,7]
-; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm31, %zmm5, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm30, %zmm14 {%k2}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm14, %zmm13 {%k3}
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm5 = ymm15[0,1,2,3,4,5],mem[6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm5, %zmm0, %zmm20 {%k4}
+; AVX512BW-ONLY-FAST-NEXT:    vpbroadcastq 8(%rcx), %ymm5
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm5 = mem[0,1,2,3],ymm5[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm5, %zmm0, %zmm2 {%k5}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm2 {%k6}
 ; AVX512BW-ONLY-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, 64(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, 128(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm26, 192(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm16, 256(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm20, 320(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm22, 384(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm12, 512(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm17, 576(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm21, 640(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, 704(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, 448(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, (%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, 768(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm6, 832(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, 64(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, 128(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm20, 192(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, 256(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, 320(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm26, 384(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm19, 512(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, 576(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm16, 640(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm18, 704(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm10, 448(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, (%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, 768(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 832(%rax)
 ; AVX512BW-ONLY-FAST-NEXT:    vzeroupper
 ; AVX512BW-ONLY-FAST-NEXT:    retq
 ;
 ; AVX512DQBW-SLOW-LABEL: store_i64_stride7_vf16:
 ; AVX512DQBW-SLOW:       # %bb.0:
 ; AVX512DQBW-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm4
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 64(%rdi), %zmm6
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%rsi), %zmm13
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 64(%rsi), %zmm7
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%rdx), %zmm8
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 64(%rdx), %zmm9
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%rcx), %zmm2
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 64(%rcx), %zmm3
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 64(%r8), %zmm11
-; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [6,14,6,14,6,14,6,14]
-; AVX512DQBW-SLOW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm12
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm3, %zmm0, %zmm12
-; AVX512DQBW-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14]
-; AVX512DQBW-SLOW-NEXT:    # ymm1 = mem[0,1,0,1]
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} ymm16 = [15,7,15,7]
-; AVX512DQBW-SLOW-NEXT:    # ymm16 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm10
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm13, %zmm16, %zmm10
-; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm7, %zmm6, %zmm16
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm14
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm7, %zmm1, %zmm14
-; AVX512DQBW-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm23 = zmm14[0,1,2,3],zmm12[4,5,6,7]
-; AVX512DQBW-SLOW-NEXT:    movb $64, %sil
-; AVX512DQBW-SLOW-NEXT:    kmovd %esi, %k1
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm23 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%r8), %zmm14
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 64(%r9), %zmm24
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%r9), %zmm25
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%rax), %zmm22
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 64(%rax), %zmm28
-; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm17 = [0,8,0,8,0,8,0,8]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm6
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 64(%rdi), %zmm18
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%rsi), %zmm7
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 64(%rsi), %zmm12
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%rdx), %zmm13
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 64(%rdx), %zmm20
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%rcx), %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 64(%rcx), %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 64(%r8), %zmm16
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm17 = [6,14,6,14,6,14,6,14]
 ; AVX512DQBW-SLOW-NEXT:    # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm14, %zmm21
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm25, %zmm17, %zmm21
-; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm15 = [13,5,13,5,13,5,13,5]
-; AVX512DQBW-SLOW-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQBW-SLOW-NEXT:    movb $96, %sil
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm19
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm0, %zmm17, %zmm19
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm9 = [15,7,15,7,15,7,15,7]
+; AVX512DQBW-SLOW-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm21
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm0, %zmm9, %zmm21
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm22
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm5 = [0,8,0,1,0,8,0,1]
+; AVX512DQBW-SLOW-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm7, %zmm5, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm10 = [13,5,13,5,13,5,13,5]
+; AVX512DQBW-SLOW-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm0, %zmm10, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm23
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm11 = [2,10,0,3,2,10,0,3]
+; AVX512DQBW-SLOW-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm8
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm0, %zmm11, %zmm8
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm15 = [1,0,10,2,1,0,10,2]
+; AVX512DQBW-SLOW-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm14
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm18, %zmm15, %zmm14
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm12, %zmm18, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm29 = [3,0,12,4,3,0,12,4]
+; AVX512DQBW-SLOW-NEXT:    # zmm29 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm20, %zmm29, %zmm0
+; AVX512DQBW-SLOW-NEXT:    movb $48, %sil
+; AVX512DQBW-SLOW-NEXT:    kmovd %esi, %k3
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm0 {%k3} = zmm18[0],zmm12[0],zmm18[2],zmm12[2],zmm18[4],zmm12[4],zmm18[6],zmm12[6]
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm12, %zmm17, %zmm18
+; AVX512DQBW-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm19 = zmm18[0,1,2,3],zmm19[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    movb $64, %sil
 ; AVX512DQBW-SLOW-NEXT:    kmovd %esi, %k1
-; AVX512DQBW-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm12 = [4,12,0,5,4,12,0,5]
-; AVX512DQBW-SLOW-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm20
-; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm24, %zmm11, %zmm17
-; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm18 = [15,7,15,7,15,7,15,7]
-; AVX512DQBW-SLOW-NEXT:    # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm14, %zmm26
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm25, %zmm18, %zmm26
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm19
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm2, %zmm18, %zmm19
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm19 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%r8), %zmm20
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%r9), %zmm28
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 64(%r9), %zmm30
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 64(%rax), %zmm27
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm12, %zmm9, %zmm22
 ; AVX512DQBW-SLOW-NEXT:    movb $24, %sil
-; AVX512DQBW-SLOW-NEXT:    kmovd %esi, %k2
-; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm3, %zmm9, %zmm18
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm16 {%k2}
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm27 = <u,1,2,3,4,15,u,u>
-; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm11, %zmm16, %zmm27
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm24, %zmm12, %zmm11
-; AVX512DQBW-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm30 = [0,1,12,7,0,1,12,7]
-; AVX512DQBW-SLOW-NEXT:    # zmm30 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm28, %zmm30, %zmm11
-; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm31 = [11,3,11,3,11,3,11,3]
+; AVX512DQBW-SLOW-NEXT:    kmovd %esi, %k1
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm22 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm18 = <u,1,2,3,4,15,u,u>
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm16, %zmm22, %zmm18
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm21 = <u,1,2,3,4,5,15,u>
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm30, %zmm18, %zmm21
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm18 = [14,1,2,3,4,5,6,15]
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm27, %zmm21, %zmm18
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm21 = <13,u,2,3,4,5,6,14>
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm30, %zmm19, %zmm21
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm19 = [0,13,2,3,4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm27, %zmm21, %zmm19
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm31 = [0,8,0,8,0,8,0,8]
 ; AVX512DQBW-SLOW-NEXT:    # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm7, %zmm31, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm18 = [2,10,0,3,2,10,0,3]
-; AVX512DQBW-SLOW-NEXT:    # zmm18 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm16
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm3, %zmm18, %zmm16
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm16 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm29 = [9,1,9,1,9,1,9,1]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm26
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm28, %zmm31, %zmm26
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm12, %zmm10, %zmm2
+; AVX512DQBW-SLOW-NEXT:    movb $96, %sil
+; AVX512DQBW-SLOW-NEXT:    kmovd %esi, %k2
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm21 = [4,12,0,5,4,12,0,5]
+; AVX512DQBW-SLOW-NEXT:    # zmm21 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm25
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm1, %zmm9, %zmm25
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm24
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm1, %zmm17, %zmm24
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm22
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm1, %zmm10, %zmm22
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm1, %zmm13, %zmm11
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm13, %zmm29, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm13
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm30, %zmm21, %zmm13
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm29 = [11,3,11,3,11,3,11,3]
 ; AVX512DQBW-SLOW-NEXT:    # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQBW-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [5,0,14,6,5,0,14,6]
-; AVX512DQBW-SLOW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm14, %zmm25, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm25, %zmm14, %zmm12
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm25, %zmm29, %zmm14
-; AVX512DQBW-SLOW-NEXT:    vmovdqa 64(%r9), %ymm5
-; AVX512DQBW-SLOW-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm24, %zmm29, %zmm20
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm25 = <u,1,2,3,4,5,15,u>
-; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm24, %zmm27, %zmm25
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = <13,u,2,3,4,5,6,14>
-; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm24, %zmm23, %zmm1
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 64(%r8), %ymm23
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm24 = ymm23[0],ymm5[0],ymm23[2],ymm5[2]
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm12, %zmm29, %zmm23
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm12 = [0,1,12,7,0,1,12,7]
+; AVX512DQBW-SLOW-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm27, %zmm12, %zmm13
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm8 {%k2}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm30, %zmm31, %zmm16
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm31 = [9,1,9,1,9,1,9,1]
+; AVX512DQBW-SLOW-NEXT:    # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm30, %zmm31, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 64(%r9), %ymm23
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm7, %zmm6, %zmm17
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm7, %zmm6, %zmm10
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm7, %zmm6, %zmm29
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm6, %zmm7, %zmm15
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm1 {%k3} = zmm6[0],zmm7[0],zmm6[2],zmm7[2],zmm6[4],zmm7[4],zmm6[6],zmm7[6]
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm7, %zmm9, %zmm6
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm28, %zmm20, %zmm9
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [5,0,14,6,5,0,14,6]
+; AVX512DQBW-SLOW-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm20, %zmm28, %zmm7
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm28, %zmm20, %zmm21
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm28, %zmm31, %zmm20
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 64(%r8), %ymm28
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm30 = ymm28[0],ymm23[0],ymm28[2],ymm23[2]
 ; AVX512DQBW-SLOW-NEXT:    movb $28, %sil
 ; AVX512DQBW-SLOW-NEXT:    kmovd %esi, %k3
-; AVX512DQBW-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm16 {%k3} = zmm24[2,3,2,3],zmm28[2,3,2,3]
-; AVX512DQBW-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm29 = [6,13,14,7,6,13,14,7]
-; AVX512DQBW-SLOW-NEXT:    # zmm29 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm26, %zmm22, %zmm29
-; AVX512DQBW-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm27 = [0,13,6,7,0,13,6,7]
-; AVX512DQBW-SLOW-NEXT:    # zmm27 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm22, %zmm0, %zmm27
-; AVX512DQBW-SLOW-NEXT:    vmovdqa (%r9), %ymm5
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm22, %zmm30, %zmm12
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%r8), %ymm26
-; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm13, %zmm4, %zmm31
-; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm2, %zmm8, %zmm18
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm18 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm24 = ymm26[0],ymm5[0],ymm26[2],ymm5[2]
-; AVX512DQBW-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm18 {%k3} = zmm24[2,3,2,3],zmm22[2,3,2,3]
-; AVX512DQBW-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm24 = [4,9,0,3,4,9,0,3]
-; AVX512DQBW-SLOW-NEXT:    # zmm24 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm22, %zmm24, %zmm14
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm3, %zmm15, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm28, %zmm24, %zmm20
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm22 = [14,1,2,3,4,5,6,15]
-; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm28, %zmm25, %zmm22
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm24 = [0,13,2,3,4,5,6,7]
-; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm28, %zmm1, %zmm24
-; AVX512DQBW-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm28 = [1,0,10,2,1,0,10,2]
-; AVX512DQBW-SLOW-NEXT:    # zmm28 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm2, %zmm8, %zmm25
-; AVX512DQBW-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [3,0,12,4,3,0,12,4]
-; AVX512DQBW-SLOW-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm9, %zmm1, %zmm3
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm31
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm2, %zmm15, %zmm31
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm8, %zmm1, %zmm2
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm9
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm6, %zmm28, %zmm9
-; AVX512DQBW-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm30 = [0,8,0,1,0,8,0,1]
+; AVX512DQBW-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm8 {%k3} = zmm30[2,3,2,3],zmm27[2,3,2,3]
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm30 = [4,9,0,3,4,9,0,3]
 ; AVX512DQBW-SLOW-NEXT:    # zmm30 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm8
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm13, %zmm30, %zmm8
-; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm7, %zmm6, %zmm30
-; AVX512DQBW-SLOW-NEXT:    movb $48, %sil
-; AVX512DQBW-SLOW-NEXT:    kmovd %esi, %k3
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 {%k3} = zmm6[0],zmm7[0],zmm6[2],zmm7[2],zmm6[4],zmm7[4],zmm6[6],zmm7[6]
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm7, %zmm15, %zmm6
-; AVX512DQBW-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm7 = [6,14,6,14]
-; AVX512DQBW-SLOW-NEXT:    # ymm7 = mem[0,1,0,1]
-; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm13, %zmm4, %zmm7
-; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm13, %zmm4, %zmm15
-; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm4, %zmm13, %zmm28
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm2 {%k3} = zmm4[0],zmm13[0],zmm4[2],zmm13[2],zmm4[4],zmm13[4],zmm4[6],zmm13[6]
-; AVX512DQBW-SLOW-NEXT:    vmovdqa (%rdx), %xmm1
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0]
-; AVX512DQBW-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm27, %zmm30, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%rax), %zmm27
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm31 = [6,13,14,7,6,13,14,7]
+; AVX512DQBW-SLOW-NEXT:    # zmm31 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm9, %zmm27, %zmm31
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm9 = [0,13,6,7,0,13,6,7]
+; AVX512DQBW-SLOW-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm27, %zmm7, %zmm9
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm27, %zmm12, %zmm21
+; AVX512DQBW-SLOW-NEXT:    vmovdqa (%r9), %ymm7
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm29, %zmm11 {%k2}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa (%r8), %ymm12
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm29 = ymm12[0],ymm7[0],ymm12[2],ymm7[2]
+; AVX512DQBW-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm11 {%k3} = zmm29[2,3,2,3],zmm27[2,3,2,3]
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm27, %zmm30, %zmm20
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%rdx), %xmm27
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm27 = xmm27[0],mem[0]
+; AVX512DQBW-SLOW-NEXT:    vinserti32x4 $1, %xmm27, %ymm0, %ymm27
 ; AVX512DQBW-SLOW-NEXT:    movb $12, %sil
 ; AVX512DQBW-SLOW-NEXT:    kmovd %esi, %k5
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm8 {%k5}
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm27, %zmm0, %zmm4 {%k5}
 ; AVX512DQBW-SLOW-NEXT:    movb $112, %sil
 ; AVX512DQBW-SLOW-NEXT:    kmovd %esi, %k7
-; AVX512DQBW-SLOW-NEXT:    vinserti64x2 $3, (%rax), %zmm21, %zmm8 {%k7}
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm6 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vinserti64x2 $3, (%rax), %zmm26, %zmm4 {%k7}
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm26, %zmm2 {%k2}
 ; AVX512DQBW-SLOW-NEXT:    movb $120, %sil
 ; AVX512DQBW-SLOW-NEXT:    kmovd %esi, %k3
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm11 {%k3}
-; AVX512DQBW-SLOW-NEXT:    vpbroadcastq 72(%rcx), %ymm0
-; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm13 {%k3}
+; AVX512DQBW-SLOW-NEXT:    vpbroadcastq 72(%rcx), %ymm2
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = mem[0,1,2,3],ymm2[4,5,6,7]
 ; AVX512DQBW-SLOW-NEXT:    movb $6, %sil
 ; AVX512DQBW-SLOW-NEXT:    kmovd %esi, %k4
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm9 {%k4}
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm0, %zmm14 {%k4}
 ; AVX512DQBW-SLOW-NEXT:    movb $56, %sil
 ; AVX512DQBW-SLOW-NEXT:    kmovd %esi, %k6
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm9 {%k6}
-; AVX512DQBW-SLOW-NEXT:    vmovdqa 64(%rdx), %xmm0
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0]
-; AVX512DQBW-SLOW-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm30 {%k5}
-; AVX512DQBW-SLOW-NEXT:    vinserti64x2 $3, 64(%rax), %zmm17, %zmm30 {%k7}
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm19, %zmm10 {%k2}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm14 {%k6}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 64(%rdx), %xmm2
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0]
+; AVX512DQBW-SLOW-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm0, %zmm5 {%k5}
+; AVX512DQBW-SLOW-NEXT:    vinserti64x2 $3, 64(%rax), %zmm16, %zmm5 {%k7}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm25, %zmm6 {%k1}
 ; AVX512DQBW-SLOW-NEXT:    movb $-31, %sil
-; AVX512DQBW-SLOW-NEXT:    kmovd %esi, %k2
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm29, %zmm10 {%k2}
-; AVX512DQBW-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm7[0,1,2,3],zmm25[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    kmovd %esi, %k1
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm6 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm17[0,1,2,3],zmm24[4,5,6,7]
 ; AVX512DQBW-SLOW-NEXT:    movb $-61, %sil
-; AVX512DQBW-SLOW-NEXT:    kmovd %esi, %k2
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm27, %zmm0 {%k2}
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm15 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm15, %zmm12 {%k3}
-; AVX512DQBW-SLOW-NEXT:    vpbroadcastq 8(%rcx), %ymm1
-; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm28 {%k4}
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm14, %zmm28 {%k6}
-; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm23, %ymm1 # 32-byte Folded Reload
-; AVX512DQBW-SLOW-NEXT:    # ymm1 = ymm23[1],mem[1],ymm23[3],mem[3]
-; AVX512DQBW-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,3,3]
-; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7]
+; AVX512DQBW-SLOW-NEXT:    kmovd %esi, %k1
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm2 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm22, %zmm10 {%k2}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm21 {%k3}
+; AVX512DQBW-SLOW-NEXT:    vpbroadcastq 8(%rcx), %ymm3
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm3 = mem[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm0, %zmm15 {%k4}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm15 {%k6}
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm28[1],ymm23[1],ymm28[3],ymm23[3]
+; AVX512DQBW-SLOW-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,2,3,3]
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7]
 ; AVX512DQBW-SLOW-NEXT:    movb $14, %cl
 ; AVX512DQBW-SLOW-NEXT:    kmovd %ecx, %k1
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm3 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm26[1],ymm5[1],ymm26[3],ymm5[3]
-; AVX512DQBW-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,3,3]
-; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7]
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm2 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm0, %zmm0 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm12[1],ymm7[1],ymm12[3],ymm7[3]
+; AVX512DQBW-SLOW-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,2,3,3]
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm0, %zmm1 {%k1}
 ; AVX512DQBW-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm28, 64(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm18, 128(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm2, 192(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm12, 256(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, 320(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm10, 384(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm30, 448(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm9, 512(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm16, 576(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm3, 640(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm11, 704(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm8, (%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm24, 768(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm22, 832(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm15, 64(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm11, 128(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, 192(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm21, 256(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm2, 320(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm6, 384(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm5, 448(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm14, 512(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm8, 576(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, 640(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm13, 704(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm4, (%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm19, 768(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm18, 832(%rax)
 ; AVX512DQBW-SLOW-NEXT:    vzeroupper
 ; AVX512DQBW-SLOW-NEXT:    retq
 ;
 ; AVX512DQBW-FAST-LABEL: store_i64_stride7_vf16:
 ; AVX512DQBW-FAST:       # %bb.0:
 ; AVX512DQBW-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%rdi), %zmm13
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%rdi), %zmm3
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%rsi), %zmm0
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%rsi), %zmm5
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%rdx), %zmm1
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%rdx), %zmm7
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%rcx), %zmm16
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%rcx), %zmm9
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%r8), %zmm30
-; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm20 = [6,14,6,14,6,14,6,14]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%rdi), %zmm10
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%rdi), %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%rsi), %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%rsi), %zmm13
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%rdx), %zmm3
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%rdx), %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%rcx), %zmm14
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%rcx), %zmm15
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%r8), %zmm21
+; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm18 = [6,14,6,14,6,14,6,14]
+; AVX512DQBW-FAST-NEXT:    # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, %zmm4
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm15, %zmm18, %zmm4
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm5
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm13, %zmm18, %zmm5
+; AVX512DQBW-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm4[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    movb $64, %sil
+; AVX512DQBW-FAST-NEXT:    kmovd %esi, %k1
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm21, %zmm4 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%r8), %zmm7
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%r9), %zmm12
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%r9), %zmm6
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%rax), %zmm9
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%rax), %zmm5
+; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm20 = [15,7,15,7,15,7,15,7]
 ; AVX512DQBW-FAST-NEXT:    # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm7, %zmm17
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm9, %zmm20, %zmm17
-; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} ymm21 = [6,14,6,14]
-; AVX512DQBW-FAST-NEXT:    # ymm21 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, %zmm18
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm5, %zmm21, %zmm18
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%r8), %zmm11
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%r9), %zmm10
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%r9), %zmm15
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%rax), %zmm14
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%rax), %zmm31
-; AVX512DQBW-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm4 = [0,8,0,1,0,8,0,1]
-; AVX512DQBW-FAST-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm13, %zmm2
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm0, %zmm4, %zmm2
-; AVX512DQBW-FAST-NEXT:    vmovdqa (%rdx), %xmm12
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%rdx), %xmm22
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm12 = xmm12[0],mem[0]
-; AVX512DQBW-FAST-NEXT:    vinserti128 $1, %xmm12, %ymm0, %ymm12
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, %zmm8
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm15, %zmm20, %zmm8
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm16
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm13, %zmm20, %zmm16
+; AVX512DQBW-FAST-NEXT:    movb $24, %sil
+; AVX512DQBW-FAST-NEXT:    kmovd %esi, %k1
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm8, %zmm16 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm31 = <u,1,2,3,4,15,u,u>
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm21, %zmm16, %zmm31
+; AVX512DQBW-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm22 = [0,8,0,1,0,8,0,1]
+; AVX512DQBW-FAST-NEXT:    # zmm22 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm10, %zmm8
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm2, %zmm22, %zmm8
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%rdx), %xmm16
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%rdx), %xmm27
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm16 = xmm16[0],mem[0]
+; AVX512DQBW-FAST-NEXT:    vinserti32x4 $1, %xmm16, %ymm0, %ymm16
 ; AVX512DQBW-FAST-NEXT:    movb $12, %sil
-; AVX512DQBW-FAST-NEXT:    kmovd %esi, %k2
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm12, %zmm0, %zmm2 {%k2}
-; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm24 = [0,8,0,8,0,8,0,8]
-; AVX512DQBW-FAST-NEXT:    # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm11, %zmm12
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm15, %zmm24, %zmm12
+; AVX512DQBW-FAST-NEXT:    kmovd %esi, %k3
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm16, %zmm0, %zmm8 {%k3}
+; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm26 = [0,8,0,8,0,8,0,8]
+; AVX512DQBW-FAST-NEXT:    # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm7, %zmm28
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm12, %zmm26, %zmm28
 ; AVX512DQBW-FAST-NEXT:    movb $112, %sil
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm24
+; AVX512DQBW-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm17 = [3,0,12,4,3,0,12,4]
+; AVX512DQBW-FAST-NEXT:    # zmm17 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm15, %zmm16
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm1, %zmm17, %zmm16
+; AVX512DQBW-FAST-NEXT:    movb $48, %dil
+; AVX512DQBW-FAST-NEXT:    kmovd %edi, %k2
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm16 {%k2} = zmm0[0],zmm13[0],zmm0[2],zmm13[2],zmm0[4],zmm13[4],zmm0[6],zmm13[6]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm23
+; AVX512DQBW-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm25 = [1,0,10,2,1,0,10,2]
+; AVX512DQBW-FAST-NEXT:    # zmm25 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm13, %zmm19
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm0, %zmm25, %zmm19
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm13, %zmm22, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm22 = xmm27[0],mem[0]
+; AVX512DQBW-FAST-NEXT:    vinserti32x4 $1, %xmm22, %ymm0, %ymm22
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm22, %zmm0, %zmm0 {%k3}
 ; AVX512DQBW-FAST-NEXT:    kmovd %esi, %k3
-; AVX512DQBW-FAST-NEXT:    vinserti64x2 $3, (%rax), %zmm12, %zmm2 {%k3}
-; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm23 = [13,5,13,5,13,5,13,5]
-; AVX512DQBW-FAST-NEXT:    # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm7, %zmm12
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm9, %zmm23, %zmm12
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, %zmm25
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm5, %zmm23, %zmm25
+; AVX512DQBW-FAST-NEXT:    vinserti64x2 $3, (%rax), %zmm28, %zmm8 {%k3}
+; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm28 = [13,5,13,5,13,5,13,5]
+; AVX512DQBW-FAST-NEXT:    # zmm28 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, %zmm29
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm15, %zmm28, %zmm29
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm13, %zmm28, %zmm24
 ; AVX512DQBW-FAST-NEXT:    movb $96, %sil
-; AVX512DQBW-FAST-NEXT:    kmovd %esi, %k1
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm12, %zmm25 {%k1}
-; AVX512DQBW-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm12 = [4,12,0,5,4,12,0,5]
-; AVX512DQBW-FAST-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm10, %zmm30, %zmm24
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm30, %zmm19
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm10, %zmm12, %zmm19
-; AVX512DQBW-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm27 = [0,1,12,7,0,1,12,7]
-; AVX512DQBW-FAST-NEXT:    # zmm27 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm31, %zmm27, %zmm19
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm21, %zmm22
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm21, %zmm27
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm6, %zmm26, %zmm21
+; AVX512DQBW-FAST-NEXT:    vinserti64x2 $3, 64(%rax), %zmm21, %zmm0 {%k3}
+; AVX512DQBW-FAST-NEXT:    kmovd %esi, %k3
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm29, %zmm24 {%k3}
+; AVX512DQBW-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm21 = [4,12,0,5,4,12,0,5]
+; AVX512DQBW-FAST-NEXT:    # zmm21 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm6, %zmm21, %zmm22
+; AVX512DQBW-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm29 = [0,1,12,7,0,1,12,7]
+; AVX512DQBW-FAST-NEXT:    # zmm29 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm5, %zmm29, %zmm22
 ; AVX512DQBW-FAST-NEXT:    movb $120, %sil
-; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm5, %zmm3, %zmm4
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm22 = xmm22[0],mem[0]
-; AVX512DQBW-FAST-NEXT:    vinserti32x4 $1, %xmm22, %ymm0, %ymm22
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm22, %zmm0, %zmm4 {%k2}
-; AVX512DQBW-FAST-NEXT:    vinserti64x2 $3, 64(%rax), %zmm24, %zmm4 {%k3}
-; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm24 = [15,7,15,7,15,7,15,7]
-; AVX512DQBW-FAST-NEXT:    # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm11, %zmm22
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm15, %zmm24, %zmm22
-; AVX512DQBW-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm28 = [6,13,14,7,6,13,14,7]
-; AVX512DQBW-FAST-NEXT:    # zmm28 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm22, %zmm14, %zmm28
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, %zmm29
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm16, %zmm24, %zmm29
-; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} ymm26 = [15,7,15,7]
-; AVX512DQBW-FAST-NEXT:    # ymm26 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm13, %zmm22
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm0, %zmm26, %zmm22
-; AVX512DQBW-FAST-NEXT:    movb $24, %dil
-; AVX512DQBW-FAST-NEXT:    kmovd %edi, %k2
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm29, %zmm22 {%k2}
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, %zmm30
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm14, %zmm20, %zmm30
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm10, %zmm26
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm2, %zmm20, %zmm26
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm30, %zmm26 {%k1}
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm12, %zmm7, %zmm20
+; AVX512DQBW-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm30 = [6,13,14,7,6,13,14,7]
+; AVX512DQBW-FAST-NEXT:    # zmm30 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm20, %zmm9, %zmm30
 ; AVX512DQBW-FAST-NEXT:    movb $-31, %dil
-; AVX512DQBW-FAST-NEXT:    kmovd %edi, %k3
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm28, %zmm22 {%k3}
-; AVX512DQBW-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm28 = [5,0,14,6,5,0,14,6]
-; AVX512DQBW-FAST-NEXT:    # zmm28 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm11, %zmm15, %zmm28
-; AVX512DQBW-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm29 = [0,13,6,7,0,13,6,7]
-; AVX512DQBW-FAST-NEXT:    # zmm29 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm14, %zmm28, %zmm29
-; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm16, %zmm1, %zmm20
-; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm0, %zmm13, %zmm21
-; AVX512DQBW-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm20 = zmm21[0,1,2,3],zmm20[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    kmovd %edi, %k1
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm30, %zmm26 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, %zmm20
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm14, %zmm18, %zmm20
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm2, %zmm10, %zmm18
+; AVX512DQBW-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm18 = zmm18[0,1,2,3],zmm20[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm20 = [5,0,14,6,5,0,14,6]
+; AVX512DQBW-FAST-NEXT:    # zmm20 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm7, %zmm12, %zmm20
+; AVX512DQBW-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm30 = [0,13,6,7,0,13,6,7]
+; AVX512DQBW-FAST-NEXT:    # zmm30 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm9, %zmm20, %zmm30
 ; AVX512DQBW-FAST-NEXT:    movb $-61, %dil
-; AVX512DQBW-FAST-NEXT:    kmovd %edi, %k3
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm29, %zmm20 {%k3}
-; AVX512DQBW-FAST-NEXT:    kmovd %esi, %k3
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm25, %zmm19 {%k3}
-; AVX512DQBW-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm25 = [3,0,12,4,3,0,12,4]
-; AVX512DQBW-FAST-NEXT:    # zmm25 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm9, %zmm21
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm7, %zmm25, %zmm21
-; AVX512DQBW-FAST-NEXT:    movb $48, %sil
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, %zmm28
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm16, %zmm23, %zmm28
-; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm0, %zmm13, %zmm23
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm28, %zmm23 {%k1}
-; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm15, %zmm11, %zmm12
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm14, %zmm27, %zmm12
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm23, %zmm12 {%k3}
-; AVX512DQBW-FAST-NEXT:    kmovd %esi, %k3
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm21 {%k3} = zmm3[0],zmm5[0],zmm3[2],zmm5[2],zmm3[4],zmm5[4],zmm3[6],zmm5[6]
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%r9), %ymm23
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%r9), %ymm27
+; AVX512DQBW-FAST-NEXT:    kmovd %edi, %k1
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm30, %zmm18 {%k1}
+; AVX512DQBW-FAST-NEXT:    kmovd %esi, %k1
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm24, %zmm22 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, %zmm20
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm14, %zmm28, %zmm20
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm2, %zmm10, %zmm28
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm20, %zmm28 {%k3}
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm12, %zmm7, %zmm21
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm9, %zmm29, %zmm21
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm28, %zmm21 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%r9), %ymm20
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%r9), %ymm24
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%r8), %ymm28
-; AVX512DQBW-FAST-NEXT:    vmovdqa {{.*#+}} ymm6 = [1,3,7,7]
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %ymm28, %ymm8
-; AVX512DQBW-FAST-NEXT:    vpermt2q %ymm27, %ymm6, %ymm8
-; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7]
+; AVX512DQBW-FAST-NEXT:    vmovdqa {{.*#+}} ymm4 = [1,3,7,7]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %ymm28, %ymm11
+; AVX512DQBW-FAST-NEXT:    vpermt2q %ymm24, %ymm4, %ymm11
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],mem[6,7]
 ; AVX512DQBW-FAST-NEXT:    movb $14, %sil
-; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm1, %zmm16, %zmm25
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm25 {%k3} = zmm13[0],zmm0[0],zmm13[2],zmm0[2],zmm13[4],zmm0[4],zmm13[6],zmm0[6]
-; AVX512DQBW-FAST-NEXT:    kmovd %esi, %k3
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm8, %zmm0, %zmm21 {%k3}
-; AVX512DQBW-FAST-NEXT:    vmovdqa (%r8), %ymm8
-; AVX512DQBW-FAST-NEXT:    vpermi2q %ymm23, %ymm8, %ymm6
-; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],mem[6,7]
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm6, %zmm0, %zmm25 {%k3}
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm6 = ymm28[0],ymm27[0],ymm28[2],ymm27[2]
-; AVX512DQBW-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm17 = zmm18[0,1,2,3],zmm17[4,5,6,7]
-; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm18 = [11,3,11,3,11,3,11,3]
-; AVX512DQBW-FAST-NEXT:    # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQBW-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm27 = [2,10,0,3,2,10,0,3]
-; AVX512DQBW-FAST-NEXT:    # zmm27 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm16, %zmm27, %zmm1
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm7, %zmm16
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm9, %zmm27, %zmm16
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, %zmm27
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm5, %zmm18, %zmm27
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm27, %zmm16 {%k1}
-; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm0, %zmm13, %zmm18
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm18, %zmm1 {%k1}
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm3, %zmm14, %zmm17
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm17 {%k2} = zmm10[0],zmm2[0],zmm10[2],zmm2[2],zmm10[4],zmm2[4],zmm10[6],zmm2[6]
+; AVX512DQBW-FAST-NEXT:    kmovd %esi, %k1
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm11, %zmm0, %zmm16 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqa (%r8), %ymm11
+; AVX512DQBW-FAST-NEXT:    vpermi2q %ymm20, %ymm11, %ymm4
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],mem[6,7]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm0, %zmm17 {%k1}
+; AVX512DQBW-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm4 = [2,10,0,3,2,10,0,3]
+; AVX512DQBW-FAST-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm15, %zmm4, %zmm1
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm14, %zmm4, %zmm3
+; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm4 = [11,3,11,3,11,3,11,3]
+; AVX512DQBW-FAST-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm13, %zmm4, %zmm23
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm2, %zmm10, %zmm4
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm10, %zmm25, %zmm2
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm10 = ymm28[0],ymm24[0],ymm28[2],ymm24[2]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm23, %zmm1 {%k3}
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm4, %zmm3 {%k3}
 ; AVX512DQBW-FAST-NEXT:    movb $28, %al
 ; AVX512DQBW-FAST-NEXT:    kmovd %eax, %k1
-; AVX512DQBW-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm16 {%k1} = zmm6[2,3,2,3],zmm31[2,3,2,3]
-; AVX512DQBW-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm6 = [1,0,10,2,1,0,10,2]
-; AVX512DQBW-FAST-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm13, %zmm6, %zmm0
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm5, %zmm13
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm3, %zmm6, %zmm13
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm6 = ymm8[0],ymm23[0],ymm8[2],ymm23[2]
-; AVX512DQBW-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm6[2,3,2,3],zmm14[2,3,2,3]
-; AVX512DQBW-FAST-NEXT:    vpbroadcastq 72(%rcx), %ymm6
-; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm6 = mem[0,1,2,3],ymm6[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm10[2,3,2,3],zmm5[2,3,2,3]
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm4 = ymm11[0],ymm20[0],ymm11[2],ymm20[2]
+; AVX512DQBW-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm3 {%k1} = zmm4[2,3,2,3],zmm9[2,3,2,3]
+; AVX512DQBW-FAST-NEXT:    vpbroadcastq 72(%rcx), %ymm4
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm4 = mem[0,1,2,3],ymm4[4,5,6,7]
 ; AVX512DQBW-FAST-NEXT:    movb $6, %al
 ; AVX512DQBW-FAST-NEXT:    kmovd %eax, %k1
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm6, %zmm0, %zmm13 {%k1}
-; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm6 = [9,1,9,1,9,1,9,1]
-; AVX512DQBW-FAST-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm15, %zmm6, %zmm11
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm30, %zmm8
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm10, %zmm6, %zmm8
-; AVX512DQBW-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm6 = [4,9,0,3,4,9,0,3]
-; AVX512DQBW-FAST-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm14, %zmm6, %zmm11
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm31, %zmm6, %zmm8
-; AVX512DQBW-FAST-NEXT:    vpbroadcastq 8(%rcx), %ymm6
-; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm6 = mem[0,1,2,3],ymm6[4,5,6,7]
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm6, %zmm0, %zmm0 {%k1}
-; AVX512DQBW-FAST-NEXT:    movb $64, %al
-; AVX512DQBW-FAST-NEXT:    kmovd %eax, %k1
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm30, %zmm17 {%k1}
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm0, %zmm19 {%k1}
+; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm4 = [9,1,9,1,9,1,9,1]
+; AVX512DQBW-FAST-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm12, %zmm4, %zmm7
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm6, %zmm4, %zmm27
+; AVX512DQBW-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm4 = [4,9,0,3,4,9,0,3]
+; AVX512DQBW-FAST-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm9, %zmm4, %zmm7
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm5, %zmm4, %zmm27
+; AVX512DQBW-FAST-NEXT:    vpbroadcastq 8(%rcx), %ymm4
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm4 = mem[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm0, %zmm2 {%k1}
 ; AVX512DQBW-FAST-NEXT:    movb $56, %al
 ; AVX512DQBW-FAST-NEXT:    kmovd %eax, %k1
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm8, %zmm13 {%k1}
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm11, %zmm0 {%k1}
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm9, %zmm24, %zmm7
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm5, %zmm26, %zmm3
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm7, %zmm3 {%k2}
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm5 = <u,1,2,3,4,15,u,u>
-; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm30, %zmm3, %zmm5
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm3 = <u,1,2,3,4,5,15,u>
-; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm10, %zmm5, %zmm3
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm5 = <13,u,2,3,4,5,6,14>
-; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm10, %zmm17, %zmm5
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm27, %zmm19 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm7, %zmm2 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm4 = <u,1,2,3,4,5,15,u>
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm6, %zmm31, %zmm4
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm7 = <13,u,2,3,4,5,6,14>
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm6, %zmm9, %zmm7
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm6 = [14,1,2,3,4,5,6,15]
-; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm31, %zmm3, %zmm6
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,13,2,3,4,5,6,7]
-; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm31, %zmm5, %zmm3
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm5, %zmm4, %zmm6
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [0,13,2,3,4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm5, %zmm7, %zmm4
 ; AVX512DQBW-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, 64(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, 128(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm25, 192(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm12, 256(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm20, 320(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm22, 384(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm4, 448(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm13, 512(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm16, 576(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm21, 640(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm19, 704(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, (%rax)
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, 768(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, 64(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, 128(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm17, 192(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm21, 256(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm18, 320(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm26, 384(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, 448(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm19, 512(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, 576(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm16, 640(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm22, 704(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm8, (%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm4, 768(%rax)
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm6, 832(%rax)
 ; AVX512DQBW-FAST-NEXT:    vzeroupper
 ; AVX512DQBW-FAST-NEXT:    retq
@@ -5754,3539 +5708,3517 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ;
 ; AVX512F-ONLY-SLOW-LABEL: store_i64_stride7_vf32:
 ; AVX512F-ONLY-SLOW:       # %bb.0:
-; AVX512F-ONLY-SLOW-NEXT:    subq $2120, %rsp # imm = 0x848
+; AVX512F-ONLY-SLOW-NEXT:    subq $2184, %rsp # imm = 0x888
 ; AVX512F-ONLY-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm8
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 64(%rdi), %zmm11
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%rsi), %zmm27
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 64(%rsi), %zmm13
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 64(%rdx), %zmm18
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%rdx), %zmm21
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%rcx), %zmm7
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 64(%rcx), %zmm25
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm17
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 64(%rdi), %zmm14
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%rsi), %zmm16
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 64(%rsi), %zmm23
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 64(%rdx), %zmm6
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%rdx), %zmm7
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%rcx), %zmm8
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 64(%rcx), %zmm29
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%r8), %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%r9), %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%r9), %zmm3
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%rax), %zmm1
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 64(%rax), %zmm5
 ; AVX512F-ONLY-SLOW-NEXT:    movb $96, %r10b
 ; AVX512F-ONLY-SLOW-NEXT:    kmovw %r10d, %k1
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm19 = [15,7,15,7,15,7,15,7]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [9,1,9,1,9,1,9,1]
 ; AVX512F-ONLY-SLOW-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm6
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm2, %zmm6
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm14
-; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [4,9,0,3,4,9,0,3]
-; AVX512F-ONLY-SLOW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm10
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm2, %zmm10
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm11
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm4 = [4,9,0,3,4,9,0,3]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3]
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm3, %zmm6
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm15
-; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [11,3,11,3,11,3,11,3]
-; AVX512F-ONLY-SLOW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm1
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm27, %zmm3, %zmm1
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm17
-; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm6 = [2,10,0,3,2,10,0,3]
-; AVX512F-ONLY-SLOW-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm3
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm6, %zmm3
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm16
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm3 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm4, %zmm10
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm12
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm15 = [11,3,11,3,11,3,11,3]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm16, %zmm15, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [2,10,0,3,2,10,0,3]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm8, %zmm10, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm22
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa (%r9), %ymm10
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu %ymm10, (%rsp) # 32-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 64(%r9), %ymm6
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 64(%r9), %ymm13
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa (%r8), %ymm1
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 64(%r8), %ymm12
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 64(%r8), %ymm18
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %ymm18, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm10[0],ymm1[2],ymm10[2]
 ; AVX512F-ONLY-SLOW-NEXT:    movb $28, %r10b
 ; AVX512F-ONLY-SLOW-NEXT:    kmovw %r10d, %k2
-; AVX512F-ONLY-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm1[2,3,2,3],zmm2[2,3,2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [4,12,0,5,4,12,0,5]
-; AVX512F-ONLY-SLOW-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm3
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm1, %zmm3
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm10
-; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,12,7,0,1,12,7]
-; AVX512F-ONLY-SLOW-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm2, %zmm1, %zmm3
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm22 = [5,0,14,6,5,0,14,6]
-; AVX512F-ONLY-SLOW-NEXT:    # zmm22 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm19
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm20
+; AVX512F-ONLY-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm1[2,3,2,3],zmm2[2,3,2,3]
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm22, %zmm19
-; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [0,13,6,7,0,13,6,7]
-; AVX512F-ONLY-SLOW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm2, %zmm3, %zmm19
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm19
-; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm4 = [15,7,15,7,15,7,15,7]
-; AVX512F-ONLY-SLOW-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm20, %zmm4, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [6,13,14,7,6,13,14,7]
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [4,12,0,5,4,12,0,5]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm10, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm4 = [0,1,12,7,0,1,12,7]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm2, %zmm4, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm20 = [5,0,14,6,5,0,14,6]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm20 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm21
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm20, %zmm21
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [0,13,6,7,0,13,6,7]
 ; AVX512F-ONLY-SLOW-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm2, %zmm19
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm13, %zmm17, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm19
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm25, %zmm16, %zmm19
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm19 {%k1}
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm12[0],ymm6[0],ymm12[2],ymm6[2]
-; AVX512F-ONLY-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm19 {%k2} = zmm0[2,3,2,3],zmm5[2,3,2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm2, %zmm21
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm19, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [6,13,14,7,6,13,14,7]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm3, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm14, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm23, %zmm15, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm22, %zmm21
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm29, %zmm22, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm18[0],ymm13[0],ymm18[2],ymm13[2]
+; AVX512F-ONLY-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k2} = zmm0[2,3,2,3],zmm5[2,3,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 64(%r8), %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 64(%r9), %zmm12
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm6
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm12, %zmm14, %zmm6
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm15, %zmm6
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm6
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm12, %zmm10, %zmm6
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm1, %zmm6
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm6
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 64(%r9), %zmm13
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm13, %zmm11, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm22, %zmm6
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm3, %zmm6
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm12, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm13, %zmm10, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm4, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm20, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm2, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm12, %zmm4, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm2, %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm13, %zmm19, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm3, %zmm5
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 128(%rdi), %zmm6
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 128(%rsi), %zmm19
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm19, %zmm17, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 128(%rdx), %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 128(%rdi), %zmm18
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 128(%rsi), %zmm22
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm22, %zmm15, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 128(%rdx), %zmm0
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 128(%rcx), %zmm28
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm12
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm28, %zmm16, %zmm12
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm17
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm12 {%k1}
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 128(%rax), %zmm29
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 128(%r9), %ymm16
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %ymm16, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 128(%r8), %ymm0
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm16[0],ymm0[2],ymm16[2]
-; AVX512F-ONLY-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm12 {%k2} = zmm0[2,3,2,3],zmm29[2,3,2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 128(%r8), %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 128(%r9), %zmm20
-; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm20, %zmm0, %zmm10
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm29, %zmm1, %zmm10
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm28, %zmm21, %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm5 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 128(%rax), %zmm30
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 128(%r9), %ymm13
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 128(%r8), %ymm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm13[0],ymm1[2],ymm13[2]
+; AVX512F-ONLY-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm1[2,3,2,3],zmm30[2,3,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 128(%r8), %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 128(%r9), %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm1, %zmm5, %zmm10
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm30, %zmm4, %zmm10
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm0, %zmm20, %zmm22
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm29, %zmm3, %zmm22
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm20, %zmm14, %zmm1
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm29, %zmm15, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm5, %zmm1, %zmm20
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm30, %zmm2, %zmm20
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm11, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm30, %zmm12, %zmm1
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm20, %zmm4, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm2, %zmm29
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm19, %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm3, %zmm30
 ; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [13,5,13,5,13,5,13,5]
 ; AVX512F-ONLY-SLOW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm3, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm24 = [6,14,6,14,6,14,6,14]
-; AVX512F-ONLY-SLOW-NEXT:    # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm24, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm4, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [3,0,12,4,3,0,12,4]
-; AVX512F-ONLY-SLOW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm21, %zmm0, %zmm7
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm1
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm25, %zmm3, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm8, %zmm3, %zmm1
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm1
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm25, %zmm24, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm12 = [6,14,6,14,6,14,6,14]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm8, %zmm12, %zmm1
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm1
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm25, %zmm4, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm8, %zmm19, %zmm1
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm18, %zmm0, %zmm25
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 192(%rdx), %zmm18
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 192(%rcx), %zmm1
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm24, %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm26
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm28, %zmm3, %zmm26
-; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm28, %zmm5, %zmm24
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm30
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm28, %zmm4, %zmm30
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm0, %zmm28
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm21
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm18, %zmm0, %zmm21
-; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm1, %zmm18, %zmm17
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm4, %zmm18
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm3, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    movb $48, %r10b
-; AVX512F-ONLY-SLOW-NEXT:    kmovw %r10d, %k3
-; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm4 = [1,0,10,2,1,0,10,2]
-; AVX512F-ONLY-SLOW-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm27, %zmm15
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm8, %zmm4, %zmm15
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm7 {%k3} = zmm8[0],zmm27[0],zmm8[2],zmm27[2],zmm8[4],zmm27[4],zmm8[6],zmm27[6]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm7
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm14
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm16
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm17
-; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm5 = [0,8,0,1,0,8,0,1]
-; AVX512F-ONLY-SLOW-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm27, %zmm5, %zmm17
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm27, %zmm3, %zmm7
-; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14]
-; AVX512F-ONLY-SLOW-NEXT:    # ymm1 = mem[0,1,0,1]
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm27, %zmm1, %zmm14
-; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm10 = [15,7,15,7]
-; AVX512F-ONLY-SLOW-NEXT:    # ymm10 = mem[0,1,0,1]
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm27, %zmm10, %zmm16
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm27
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm11, %zmm4, %zmm27
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm25 {%k3} = zmm11[0],zmm13[0],zmm11[2],zmm13[2],zmm11[4],zmm13[4],zmm11[6],zmm13[6]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm12
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm31
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm13, %zmm5, %zmm11
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm13, %zmm3, %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm13, %zmm1, %zmm12
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm13, %zmm10, %zmm31
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 192(%rdi), %zmm22
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 192(%rsi), %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm22, %zmm23
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm1, %zmm23
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm22, %zmm13
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm3, %zmm13
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm19, %zmm8
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm6, %zmm4, %zmm8
-; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm19, %zmm6, %zmm3
-; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm19, %zmm6, %zmm1
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm28 {%k3} = zmm6[0],zmm19[0],zmm6[2],zmm19[2],zmm6[4],zmm19[4],zmm6[6],zmm19[6]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm9
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm19, %zmm5, %zmm6
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm19, %zmm10, %zmm9
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm21 {%k3} = zmm22[0],zmm0[0],zmm22[2],zmm0[2],zmm22[4],zmm0[4],zmm22[6],zmm0[6]
-; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm0, %zmm22, %zmm5
-; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm22, %zmm0, %zmm4
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm0, %zmm22, %zmm19
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm10, %zmm22
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa (%rdx), %xmm0
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-ONLY-SLOW-NEXT:    movb $12, %sil
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm19, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [3,0,12,4,3,0,12,4]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm2, %zmm8
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm31
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm29, %zmm3, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm29, %zmm12, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm29, %zmm19, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm6, %zmm2, %zmm29
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm28, %zmm3, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm28, %zmm12, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm28, %zmm19, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm2, %zmm28
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 192(%rdx), %zmm10
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 192(%rcx), %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm24
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm10, %zmm2, %zmm24
+; AVX512F-ONLY-SLOW-NEXT:    movb $48, %r10b
+; AVX512F-ONLY-SLOW-NEXT:    kmovw %r10d, %k3
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 192(%rdi), %zmm7
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 192(%rsi), %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm24 {%k3} = zmm7[0],zmm2[0],zmm7[2],zmm2[2],zmm7[4],zmm2[4],zmm7[6],zmm2[6]
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm11 = [1,0,10,2,1,0,10,2]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm26
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm17, %zmm11, %zmm26
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm13 = [0,8,0,1,0,8,0,1]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm27
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm16, %zmm13, %zmm27
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm21
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm14, %zmm11, %zmm21
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm14, %zmm23
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm13, %zmm23
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm8
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm22, %zmm19
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm18, %zmm11, %zmm19
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm20
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm22, %zmm13, %zmm20
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm2, %zmm7, %zmm13
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm7, %zmm2, %zmm11
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm25
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm6
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm2, %zmm15, %zmm7
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm2, %zmm12, %zmm25
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm2, %zmm3, %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm2, %zmm1, %zmm6
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm31 {%k3} = zmm17[0],zmm16[0],zmm17[2],zmm16[2],zmm17[4],zmm16[4],zmm17[6],zmm16[6]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm16
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm15
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm2, %zmm3, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm2, %zmm12, %zmm16
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm2, %zmm1, %zmm15
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm29 {%k3} = zmm14[0],zmm8[0],zmm14[2],zmm8[2],zmm14[4],zmm8[4],zmm14[6],zmm8[6]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm14, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm14, %zmm17
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm8, %zmm3, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm8, %zmm12, %zmm17
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm8, %zmm1, %zmm14
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm8
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm3, %zmm8
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm22, %zmm18, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm9
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm12, %zmm9
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm22, %zmm18, %zmm12
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm28 {%k3} = zmm18[0],zmm22[0],zmm18[2],zmm22[2],zmm18[4],zmm22[4],zmm18[6],zmm22[6]
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm22, %zmm1, %zmm18
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm22
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm1, %zmm22
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm1, %zmm10
+; AVX512F-ONLY-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm31 = zmm25[0,1,2,3],zmm9[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 192(%r8), %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm9 = <0,11,u,u,4,5,6,7>
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm0, %zmm24, %zmm9
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm5 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm25 = <12,u,u,3,4,5,6,13>
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm0, %zmm5, %zmm25
+; AVX512F-ONLY-SLOW-NEXT:    movb $24, %sil
 ; AVX512F-ONLY-SLOW-NEXT:    kmovw %esi, %k3
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm17 {%k3}
-; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,0,8,0,8,0,8]
-; AVX512F-ONLY-SLOW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm10 # 64-byte Folded Reload
-; AVX512F-ONLY-SLOW-NEXT:    vinserti32x4 $3, (%rax), %zmm10, %zmm10
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm22, %zmm6 {%k3}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm5 = <u,1,2,3,4,15,u,u>
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm0, %zmm6, %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa (%rdx), %xmm6
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
+; AVX512F-ONLY-SLOW-NEXT:    movb $12, %sil
+; AVX512F-ONLY-SLOW-NEXT:    kmovw %esi, %k4
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm6, %zmm0, %zmm27 {%k4}
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm6 = [0,8,0,8,0,8,0,8]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm22 # 64-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 64(%rdx), %xmm8
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm8 = xmm8[0],mem[0]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm8, %zmm0, %zmm23 {%k4}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 128(%rdx), %xmm8
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm8 = xmm8[0],mem[0]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm8, %zmm0, %zmm20 {%k4}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 192(%rdx), %xmm8
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm8 = xmm8[0],mem[0]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm8, %zmm0, %zmm13 {%k4}
+; AVX512F-ONLY-SLOW-NEXT:    vinserti32x4 $3, (%rax), %zmm22, %zmm8
 ; AVX512F-ONLY-SLOW-NEXT:    movb $112, %sil
 ; AVX512F-ONLY-SLOW-NEXT:    kmovw %esi, %k4
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm17 {%k4}
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 64(%rdx), %xmm10
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm10, %zmm0, %zmm11 {%k3}
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm10 # 64-byte Folded Reload
-; AVX512F-ONLY-SLOW-NEXT:    vinserti32x4 $3, 64(%rax), %zmm10, %zmm10
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm11 {%k4}
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 128(%rdx), %xmm10
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm10, %zmm0, %zmm6 {%k3}
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm20, %zmm0, %zmm10
-; AVX512F-ONLY-SLOW-NEXT:    vinserti32x4 $3, 128(%rax), %zmm10, %zmm10
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm6 {%k4}
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 192(%rdx), %xmm10
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm10, %zmm0, %zmm5 {%k3}
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 192(%r8), %zmm19
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 192(%r9), %zmm20
-; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm20, %zmm19, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vinserti32x4 $3, 192(%rax), %zmm0, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm5 {%k4}
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm7 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm27 {%k4}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm1 # 64-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT:    vinserti32x4 $3, 64(%rax), %zmm1, %zmm8
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm23 {%k4}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm1 # 64-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT:    vinserti32x4 $3, 128(%rax), %zmm1, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm20 {%k4}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 192(%r9), %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm1, %zmm0, %zmm6
+; AVX512F-ONLY-SLOW-NEXT:    vinserti32x4 $3, 192(%rax), %zmm6, %zmm6
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm13 {%k4}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm4 {%k1}
 ; AVX512F-ONLY-SLOW-NEXT:    movb $120, %sil
-; AVX512F-ONLY-SLOW-NEXT:    kmovw %esi, %k3
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm0 {%k3}
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm7 # 64-byte Folded Reload
-; AVX512F-ONLY-SLOW-NEXT:    # zmm7 = zmm14[0,1,2,3],mem[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    kmovw %esi, %k4
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm24 {%k4}
+; AVX512F-ONLY-SLOW-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm22 # 64-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT:    # zmm22 = zmm16[0,1,2,3],mem[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm2 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm16 {%k4}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k4}
 ; AVX512F-ONLY-SLOW-NEXT:    movb $-61, %sil
 ; AVX512F-ONLY-SLOW-NEXT:    kmovw %esi, %k4
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm7 {%k4}
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k1}
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm0 {%k3}
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm26, %zmm3 {%k1}
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm26 {%k3}
-; AVX512F-ONLY-SLOW-NEXT:    movb $24, %sil
-; AVX512F-ONLY-SLOW-NEXT:    kmovw %esi, %k3
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm16 {%k3}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm22 {%k4}
+; AVX512F-ONLY-SLOW-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm2 # 64-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT:    # zmm2 = zmm17[0,1,2,3],mem[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm2 {%k4}
 ; AVX512F-ONLY-SLOW-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm3 # 64-byte Folded Reload
 ; AVX512F-ONLY-SLOW-NEXT:    # zmm3 = zmm12[0,1,2,3],mem[4,5,6,7]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm3 {%k4}
-; AVX512F-ONLY-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm24[4,5,6,7]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k4}
-; AVX512F-ONLY-SLOW-NEXT:    movb $-31, %sil
-; AVX512F-ONLY-SLOW-NEXT:    kmovw %esi, %k4
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm16 {%k4}
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm31 {%k3}
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm31 {%k4}
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm30, %zmm9 {%k3}
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm29, %zmm9 {%k4}
-; AVX512F-ONLY-SLOW-NEXT:    vpbroadcastq 8(%rcx), %ymm0
-; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm3 {%k4}
+; AVX512F-ONLY-SLOW-NEXT:    vpbroadcastq 8(%rcx), %ymm6
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm6 = mem[0,1,2,3],ymm6[4,5,6,7]
 ; AVX512F-ONLY-SLOW-NEXT:    movb $6, %sil
 ; AVX512F-ONLY-SLOW-NEXT:    kmovw %esi, %k4
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm15 {%k4}
-; AVX512F-ONLY-SLOW-NEXT:    vpbroadcastq 72(%rcx), %ymm0
-; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm27 {%k4}
-; AVX512F-ONLY-SLOW-NEXT:    vpbroadcastq 136(%rcx), %ymm0
-; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm8 {%k4}
-; AVX512F-ONLY-SLOW-NEXT:    vpbroadcastq 200(%rcx), %ymm0
-; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm4 {%k4}
-; AVX512F-ONLY-SLOW-NEXT:    movb $56, %cl
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm6, %zmm0, %zmm26 {%k4}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm15 {%k3}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm14 {%k3}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm18 {%k3}
+; AVX512F-ONLY-SLOW-NEXT:    movb $-31, %sil
+; AVX512F-ONLY-SLOW-NEXT:    kmovw %esi, %k3
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm15 {%k3}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm14 {%k3}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm30, %zmm18 {%k3}
+; AVX512F-ONLY-SLOW-NEXT:    movb $56, %sil
+; AVX512F-ONLY-SLOW-NEXT:    kmovw %esi, %k3
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm26 {%k3}
+; AVX512F-ONLY-SLOW-NEXT:    vpbroadcastq 72(%rcx), %ymm6
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm6 = mem[0,1,2,3],ymm6[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm6, %zmm0, %zmm21 {%k4}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm21 {%k3}
+; AVX512F-ONLY-SLOW-NEXT:    vpbroadcastq 136(%rcx), %ymm6
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm6 = mem[0,1,2,3],ymm6[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm6, %zmm0, %zmm19 {%k4}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm19 {%k3}
+; AVX512F-ONLY-SLOW-NEXT:    vpbroadcastq 200(%rcx), %ymm6
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm6 = mem[0,1,2,3],ymm6[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm6, %zmm0, %zmm11 {%k4}
+; AVX512F-ONLY-SLOW-NEXT:    movb $64, %cl
 ; AVX512F-ONLY-SLOW-NEXT:    kmovw %ecx, %k4
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm15 {%k4}
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm27 {%k4}
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm8 {%k4}
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm20, %zmm19, %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 192(%rax), %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm10, %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm4 {%k4}
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm13 {%k1}
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm14 {%k1}
-; AVX512F-ONLY-SLOW-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm23, %zmm10 # 64-byte Folded Reload
-; AVX512F-ONLY-SLOW-NEXT:    # zmm10 = zmm23[0,1,2,3],mem[4,5,6,7]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm12 = <0,11,u,u,4,5,6,7>
-; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm19, %zmm21, %zmm12
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm2 = <0,1,11,u,4,5,6,7>
-; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm20, %zmm12, %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 192(%r8), %ymm12
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm12 = ymm12[0],mem[0],ymm12[2],mem[2]
-; AVX512F-ONLY-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm12[2,3,2,3],zmm0[2,3,2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm14, %zmm21
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm22 {%k3}
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq (%rsp), %ymm12, %ymm12 # 32-byte Folded Reload
-; AVX512F-ONLY-SLOW-NEXT:    # ymm12 = ymm12[1],mem[1],ymm12[3],mem[3]
-; AVX512F-ONLY-SLOW-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[0,2,3,3]
-; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],mem[6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm31 {%k4}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm6, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 192(%rax), %zmm6
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm6, %zmm8, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm11 {%k3}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm0 = <0,1,11,u,4,5,6,7>
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm1, %zmm9, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm10 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 192(%r8), %ymm7
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm7 = ymm7[0],mem[0],ymm7[2],mem[2]
+; AVX512F-ONLY-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm10 {%k2} = zmm7[2,3,2,3],zmm6[2,3,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq (%rsp), %ymm7, %ymm7 # 32-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT:    # ymm7 = ymm7[1],mem[1],ymm7[3],mem[3]
+; AVX512F-ONLY-SLOW-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[0,2,3,3]
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],mem[6,7]
 ; AVX512F-ONLY-SLOW-NEXT:    movb $14, %cl
 ; AVX512F-ONLY-SLOW-NEXT:    kmovw %ecx, %k1
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm12, %zmm0, %zmm18 {%k1}
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload
-; AVX512F-ONLY-SLOW-NEXT:    # ymm12 = ymm12[1],mem[1],ymm12[3],mem[3]
-; AVX512F-ONLY-SLOW-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[0,2,3,3]
-; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],mem[6,7]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm12, %zmm0, %zmm25 {%k1}
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload
-; AVX512F-ONLY-SLOW-NEXT:    # ymm12 = ymm12[1],mem[1],ymm12[3],mem[3]
-; AVX512F-ONLY-SLOW-NEXT:    movb $64, %cl
-; AVX512F-ONLY-SLOW-NEXT:    kmovw %ecx, %k2
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm19, %zmm10 {%k2}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm7, %zmm0, %zmm12 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT:    # ymm7 = ymm7[1],mem[1],ymm7[3],mem[3]
+; AVX512F-ONLY-SLOW-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[0,2,3,3]
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],mem[6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm7, %zmm0, %zmm29 {%k1}
 ; AVX512F-ONLY-SLOW-NEXT:    movb $8, %cl
 ; AVX512F-ONLY-SLOW-NEXT:    kmovw %ecx, %k2
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k2}
-; AVX512F-ONLY-SLOW-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[0,2,3,3]
-; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],mem[6,7]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm12, %zmm0, %zmm28 {%k1}
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm12 = <12,u,u,3,4,5,6,13>
-; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm19, %zmm13, %zmm12
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm13 = <u,1,2,3,4,15,u,u>
-; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm19, %zmm22, %zmm13
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm14 = <0,12,u,3,4,5,6,7>
-; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm20, %zmm12, %zmm14
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm12 = <13,u,2,3,4,5,6,14>
-; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm20, %zmm10, %zmm12
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm10 = <u,1,2,3,4,5,15,u>
-; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm20, %zmm13, %zmm10
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm13 = [0,1,12,3,4,5,6,7]
-; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm0, %zmm14, %zmm13
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm14 = [0,13,2,3,4,5,6,7]
-; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm0, %zmm12, %zmm14
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm12 = [14,1,2,3,4,5,6,15]
-; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm0, %zmm10, %zmm12
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm0 {%k2}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT:    # ymm7 = ymm7[1],mem[1],ymm7[3],mem[3]
+; AVX512F-ONLY-SLOW-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[0,2,3,3]
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],mem[6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm7, %zmm0, %zmm28 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm7 = <0,12,u,3,4,5,6,7>
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm1, %zmm25, %zmm7
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm8 = <13,u,2,3,4,5,6,14>
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm1, %zmm31, %zmm8
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm9 = <u,1,2,3,4,5,15,u>
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm1, %zmm5, %zmm9
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,1,12,3,4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm6, %zmm7, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm5 = [0,13,2,3,4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm6, %zmm8, %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm7 = [14,1,2,3,4,5,6,15]
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm6, %zmm9, %zmm7
 ; AVX512F-ONLY-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm21, 1472(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, 1408(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, 1280(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, 1216(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm26, 1152(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 1024(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, 960(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm31, 832(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, 768(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 704(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 576(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm27, 512(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm16, 384(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, 320(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 256(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 128(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm15, 64(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, 1344(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm10, 1472(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm11, 1408(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm18, 1280(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, 1216(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, 1152(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm3, 1024(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm19, 960(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm14, 832(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, 768(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm16, 704(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm2, 576(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm21, 512(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm15, 384(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm22, 320(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm24, 256(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm2, 128(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm26, 64(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, 1344(%rax)
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm28, 1088(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, 896(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm25, 640(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm11, 448(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm18, 192(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm17, (%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm12, 1728(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm14, 1664(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, 1600(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, 1536(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    addq $2120, %rsp # imm = 0x848
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm20, 896(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm29, 640(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm23, 448(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm12, 192(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm27, (%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, 1728(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, 1664(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, 1600(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, 1536(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    addq $2184, %rsp # imm = 0x888
 ; AVX512F-ONLY-SLOW-NEXT:    vzeroupper
 ; AVX512F-ONLY-SLOW-NEXT:    retq
 ;
 ; AVX512F-ONLY-FAST-LABEL: store_i64_stride7_vf32:
 ; AVX512F-ONLY-FAST:       # %bb.0:
-; AVX512F-ONLY-FAST-NEXT:    subq $2024, %rsp # imm = 0x7E8
+; AVX512F-ONLY-FAST-NEXT:    subq $2152, %rsp # imm = 0x868
 ; AVX512F-ONLY-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%rdi), %zmm7
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%rdi), %zmm30
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%rsi), %zmm17
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%rsi), %zmm15
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%rdx), %zmm13
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%rdx), %zmm8
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%rcx), %zmm27
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%rcx), %zmm20
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%rdi), %zmm16
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%rdi), %zmm17
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%rsi), %zmm29
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%rsi), %zmm28
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%rdx), %zmm19
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%rdx), %zmm23
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%rcx), %zmm22
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%rcx), %zmm30
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%r8), %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%r9), %zmm9
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%rax), %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%r9), %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%rax), %zmm3
 ; AVX512F-ONLY-FAST-NEXT:    movb $96, %r10b
 ; AVX512F-ONLY-FAST-NEXT:    kmovw %r10d, %k1
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm21 = [15,7,15,7,15,7,15,7]
+; AVX512F-ONLY-FAST-NEXT:    # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [9,1,9,1,9,1,9,1]
 ; AVX512F-ONLY-FAST-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm3
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm9, %zmm2, %zmm3
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm10
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm2, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm8
 ; AVX512F-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [4,9,0,3,4,9,0,3]
 ; AVX512F-ONLY-FAST-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm2, %zmm3
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm11
-; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [11,3,11,3,11,3,11,3]
-; AVX512F-ONLY-FAST-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, %zmm1
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm17, %zmm2, %zmm1
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm14
-; AVX512F-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [2,10,0,3,2,10,0,3]
-; AVX512F-ONLY-FAST-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm27, %zmm3, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm16
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm2, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm9
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm25 = [11,3,11,3,11,3,11,3]
+; AVX512F-ONLY-FAST-NEXT:    # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm16, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm29, %zmm25, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm4 = [2,10,0,3,2,10,0,3]
+; AVX512F-ONLY-FAST-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm23, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm22, %zmm4, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm15
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm5
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa (%r9), %ymm1
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa 64(%r9), %ymm6
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa 128(%r9), %ymm2
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%r8), %ymm19
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%r8), %ymm23
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 128(%r8), %ymm22
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm19[0],ymm1[0],ymm19[2],ymm1[2]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 128(%r9), %ymm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa (%r8), %ymm10
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 64(%r8), %ymm12
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 128(%r8), %ymm11
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm2 = ymm10[0],ymm1[0],ymm10[2],ymm1[2]
 ; AVX512F-ONLY-FAST-NEXT:    movb $28, %r10b
 ; AVX512F-ONLY-FAST-NEXT:    kmovw %r10d, %k2
-; AVX512F-ONLY-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm3[2,3,2,3],zmm4[2,3,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm2[2,3,2,3],zmm3[2,3,2,3]
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm21
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} ymm3 = [1,3,7,7]
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %ymm1, %ymm3, %ymm19
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %ymm19, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm4 = [4,12,0,5,4,12,0,5]
-; AVX512F-ONLY-FAST-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm9, %zmm4, %zmm1
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm19
-; AVX512F-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm5 = [0,1,12,7,0,1,12,7]
-; AVX512F-ONLY-FAST-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm21, %zmm5, %zmm1
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm25 = [5,0,14,6,5,0,14,6]
-; AVX512F-ONLY-FAST-NEXT:    # zmm25 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, %zmm1
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm25, %zmm1
-; AVX512F-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm4 = [0,13,6,7,0,13,6,7]
-; AVX512F-ONLY-FAST-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm21, %zmm4, %zmm1
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm28 = [15,7,15,7,15,7,15,7]
-; AVX512F-ONLY-FAST-NEXT:    # zmm28 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm9, %zmm28, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [6,13,14,7,6,13,14,7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} ymm5 = [1,3,7,7]
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %ymm1, %ymm5, %ymm10
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu %ymm10, (%rsp) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [4,12,0,5,4,12,0,5]
 ; AVX512F-ONLY-FAST-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm1, %zmm21
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm23[0],ymm6[0],ymm23[2],ymm6[2]
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %ymm6, %ymm3, %ymm23
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %ymm23, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm6 = ymm22[0],ymm2[0],ymm22[2],ymm2[2]
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %ymm2, %ymm3, %ymm22
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %ymm22, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm30, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm15, %zmm14, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm3
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm20, %zmm16, %zmm3
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%rax), %zmm18
-; AVX512F-ONLY-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm0[2,3,2,3],zmm18[2,3,2,3]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%r8), %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%r9), %zmm22
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm22, %zmm10, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm11, %zmm9
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm18, %zmm11, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm22, %zmm19, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm18, %zmm5, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm22, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm25, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm18, %zmm4, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm1, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm10
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,12,7,0,1,12,7]
+; AVX512F-ONLY-FAST-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm1, %zmm2
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm22, %zmm28, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm1, %zmm18
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 128(%rdi), %zmm26
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 128(%rsi), %zmm3
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm26, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm14, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 128(%rdx), %zmm24
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 128(%rcx), %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm24, %zmm14
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm16, %zmm14
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm16, %zmm23
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm14 {%k1}
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 128(%rax), %zmm11
-; AVX512F-ONLY-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm6[2,3,2,3],zmm11[2,3,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm13 = [5,0,14,6,5,0,14,6]
+; AVX512F-ONLY-FAST-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, %zmm14
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, %zmm18
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm13, %zmm14
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [0,13,6,7,0,13,6,7]
+; AVX512F-ONLY-FAST-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm2, %zmm14
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 128(%r8), %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 128(%r9), %zmm14
-; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm14, %zmm2, %zmm19
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm11, %zmm5, %zmm19
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm2, %zmm14, %zmm25
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm11, %zmm4, %zmm25
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm4
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm14, %zmm10, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm18, %zmm21, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [6,13,14,7,6,13,14,7]
+; AVX512F-ONLY-FAST-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm3, %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm12[0],ymm6[0],ymm12[2],ymm6[2]
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %ymm6, %ymm5, %ymm12
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm7 = ymm11[0],ymm4[0],ymm11[2],ymm4[2]
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %ymm4, %ymm5, %ymm11
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm17, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm28, %zmm25, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm19, %zmm6
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm15, %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm30, %zmm15, %zmm6
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm6 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%rax), %zmm11
+; AVX512F-ONLY-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm6 {%k2} = zmm0[2,3,2,3],zmm11[2,3,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%r8), %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%r9), %zmm6
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm6, %zmm8, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm11, %zmm9, %zmm4
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm14, %zmm28, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm2, %zmm1, %zmm11
-; AVX512F-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm5 = [3,0,12,4,3,0,12,4]
-; AVX512F-ONLY-FAST-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm27, %zmm29
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm8, %zmm5, %zmm29
-; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [13,5,13,5,13,5,13,5]
-; AVX512F-ONLY-FAST-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm27, %zmm1, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm10 = [6,14,6,14,6,14,6,14]
-; AVX512F-ONLY-FAST-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm27, %zmm10, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm27, %zmm28, %zmm8
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm6, %zmm10, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm11, %zmm1, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm6, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm13, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm11, %zmm2, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm6, %zmm21, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm3, %zmm11
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 128(%rdi), %zmm27
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 128(%rsi), %zmm18
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm27, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm18, %zmm25, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 128(%rdx), %zmm31
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 128(%rcx), %zmm15
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm31, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm15, %zmm5, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm4 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 128(%rax), %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm7[2,3,2,3],zmm5[2,3,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 128(%r8), %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 128(%r9), %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm4, %zmm0, %zmm10
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm1, %zmm10
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm0, %zmm4, %zmm13
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm2, %zmm13
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm8, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm9, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm21, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm3, %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 192(%rdx), %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 192(%rcx), %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm13 = [3,0,12,4,3,0,12,4]
+; AVX512F-ONLY-FAST-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm24
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm13, %zmm24
 ; AVX512F-ONLY-FAST-NEXT:    movb $48, %r10b
 ; AVX512F-ONLY-FAST-NEXT:    kmovw %r10d, %k3
-; AVX512F-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm31 = [0,8,0,1,0,8,0,1]
-; AVX512F-ONLY-FAST-NEXT:    # zmm31 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, %zmm25
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm17, %zmm31, %zmm25
-; AVX512F-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm4 = [1,0,10,2,1,0,10,2]
-; AVX512F-ONLY-FAST-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm17, %zmm21
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm4, %zmm21
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm29 {%k3} = zmm7[0],zmm17[0],zmm7[2],zmm17[2],zmm7[4],zmm17[4],zmm7[6],zmm17[6]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, %zmm9
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm17, %zmm1, %zmm9
-; AVX512F-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm8 = [6,14,6,14]
-; AVX512F-ONLY-FAST-NEXT:    # ymm8 = mem[0,1,0,1]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm17, %zmm8, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [15,7,15,7]
-; AVX512F-ONLY-FAST-NEXT:    # ymm2 = mem[0,1,0,1]
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm17, %zmm2, %zmm7
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm6
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm20, %zmm17
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm13, %zmm5, %zmm17
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm20, %zmm1, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm20, %zmm10, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 192(%rdi), %zmm12
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 192(%rsi), %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm24 {%k3} = zmm12[0],zmm3[0],zmm12[2],zmm3[2],zmm12[4],zmm3[4],zmm12[6],zmm3[6]
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm14 = [0,8,0,1,0,8,0,1]
+; AVX512F-ONLY-FAST-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm16, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm29, %zmm14, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm17, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm28, %zmm14, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm27, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm18, %zmm14, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm3, %zmm12, %zmm14
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm11 = [1,0,10,2,1,0,10,2]
+; AVX512F-ONLY-FAST-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm29, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm16, %zmm11, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm28, %zmm26
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm17, %zmm11, %zmm26
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm18, %zmm20
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm27, %zmm11, %zmm20
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm12, %zmm3, %zmm11
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm12, %zmm8
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm12, %zmm10
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm12, %zmm9
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm25, %zmm12
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm4 = [6,14,6,14,6,14,6,14]
+; AVX512F-ONLY-FAST-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm4, %zmm8
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [13,5,13,5,13,5,13,5]
+; AVX512F-ONLY-FAST-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm1, %zmm10
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm21, %zmm9
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm22, %zmm25
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm23, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm23, %zmm13, %zmm25
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm22, %zmm1, %zmm23
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm22, %zmm4, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm22, %zmm21, %zmm2
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm20, %zmm28, %zmm13
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm25 {%k3} = zmm16[0],zmm29[0],zmm16[2],zmm29[2],zmm16[4],zmm29[4],zmm16[6],zmm29[6]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm16, %zmm6
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm29, %zmm1, %zmm6
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm16, %zmm22
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm29, %zmm4, %zmm22
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm29, %zmm21, %zmm16
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm30, %zmm16
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm15, %zmm31, %zmm16
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm15, %zmm20
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm30, %zmm4, %zmm20
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm17 {%k3} = zmm30[0],zmm15[0],zmm30[2],zmm15[2],zmm30[4],zmm15[4],zmm30[6],zmm15[6]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm30, %zmm7
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm15, %zmm1, %zmm7
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm30, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm15, %zmm8, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm19, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm19, %zmm13, %zmm16
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm30, %zmm1, %zmm19
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm29
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm30, %zmm4, %zmm29
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm30, %zmm21, %zmm2
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm15, %zmm6, %zmm30
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm6, %zmm13
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 192(%rdx), %zmm15
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 192(%rcx), %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm15, %zmm6
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm2, %zmm10, %zmm6
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm12
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm15, %zmm5, %zmm12
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm15, %zmm6
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm2, %zmm1, %zmm6
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm2, %zmm15, %zmm23
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm2, %zmm28, %zmm15
-; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm24, %zmm0, %zmm5
-; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm0, %zmm24, %zmm10
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm24, %zmm27
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm28, %zmm24
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm1, %zmm27
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 192(%rdi), %zmm19
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 192(%rsi), %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm19, %zmm23
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm8, %zmm23
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm19, %zmm6
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm1, %zmm6
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm26, %zmm28
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm31, %zmm28
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm26, %zmm4, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm5 {%k3} = zmm26[0],zmm3[0],zmm26[2],zmm3[2],zmm26[4],zmm3[4],zmm26[6],zmm3[6]
-; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm3, %zmm26, %zmm1
-; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm3, %zmm26, %zmm8
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm13, %zmm26
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm12 {%k3} = zmm19[0],zmm0[0],zmm19[2],zmm0[2],zmm19[4],zmm0[4],zmm19[6],zmm0[6]
-; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm0, %zmm19, %zmm31
-; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm19, %zmm0, %zmm4
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm0, %zmm19, %zmm3
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm13, %zmm19
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm17, %zmm30
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm16 {%k3} = zmm17[0],zmm28[0],zmm17[2],zmm28[2],zmm17[4],zmm28[4],zmm17[6],zmm28[6]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm17, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm28, %zmm1, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm28, %zmm4, %zmm17
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm28, %zmm21, %zmm30
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm31, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm31, %zmm15, %zmm13
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm15, %zmm1, %zmm31
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm28
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm15, %zmm4, %zmm28
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm15, %zmm21, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, %zmm15
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm4, %zmm15
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm1, %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm13 {%k3} = zmm27[0],zmm18[0],zmm27[2],zmm18[2],zmm27[4],zmm18[4],zmm27[6],zmm18[6]
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm18, %zmm27, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm18, %zmm27, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm18, %zmm21, %zmm27
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, %zmm18
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm21, %zmm18
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm2, %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm8[0,1,2,3],zmm15[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 192(%r8), %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm0 = <0,11,u,u,4,5,6,7>
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm2, %zmm24, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 192(%r9), %zmm8
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm15 = <0,1,11,u,4,5,6,7>
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm8, %zmm0, %zmm15
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, %zmm10 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm0 = <12,u,u,3,4,5,6,13>
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm2, %zmm10, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    movb $24, %sil
+; AVX512F-ONLY-FAST-NEXT:    kmovw %esi, %k3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm18, %zmm9 {%k3}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm24 = <u,1,2,3,4,15,u,u>
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm2, %zmm9, %zmm24
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa (%rdx), %xmm0
 ; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0]
 ; AVX512F-ONLY-FAST-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
 ; AVX512F-ONLY-FAST-NEXT:    movb $12, %sil
-; AVX512F-ONLY-FAST-NEXT:    kmovw %esi, %k3
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm25 {%k3}
+; AVX512F-ONLY-FAST-NEXT:    kmovw %esi, %k4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm21 {%k4}
 ; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,0,8,0,8,0,8]
 ; AVX512F-ONLY-FAST-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
-; AVX512F-ONLY-FAST-NEXT:    vinserti32x4 $3, (%rax), %zmm3, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm9 # 64-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 64(%rdx), %xmm5
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0]
+; AVX512F-ONLY-FAST-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm5, %zmm0, %zmm10 {%k4}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 128(%rdx), %xmm5
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0]
+; AVX512F-ONLY-FAST-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm5, %zmm0, %zmm18 {%k4}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 192(%rdx), %xmm5
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0]
+; AVX512F-ONLY-FAST-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm5, %zmm0, %zmm14 {%k4}
+; AVX512F-ONLY-FAST-NEXT:    vinserti32x4 $3, (%rax), %zmm9, %zmm5
 ; AVX512F-ONLY-FAST-NEXT:    movb $112, %sil
 ; AVX512F-ONLY-FAST-NEXT:    kmovw %esi, %k4
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm25 {%k4}
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa 64(%rdx), %xmm3
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0]
-; AVX512F-ONLY-FAST-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm0, %zmm16 {%k3}
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm22, %zmm0, %zmm3
-; AVX512F-ONLY-FAST-NEXT:    vinserti32x4 $3, 64(%rax), %zmm3, %zmm3
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm16 {%k4}
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa 128(%rdx), %xmm3
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0]
-; AVX512F-ONLY-FAST-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm0, %zmm28 {%k3}
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm14, %zmm0, %zmm3
-; AVX512F-ONLY-FAST-NEXT:    vinserti32x4 $3, 128(%rax), %zmm3, %zmm3
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm28 {%k4}
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa 192(%rdx), %xmm3
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0]
-; AVX512F-ONLY-FAST-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm0, %zmm31 {%k3}
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 192(%r8), %zmm22
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 192(%r9), %zmm14
-; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm14, %zmm22, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, %zmm21 {%k4}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    vinserti32x4 $3, 64(%rax), %zmm5, %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, %zmm10 {%k4}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    vinserti32x4 $3, 128(%rax), %zmm5, %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, %zmm18 {%k4}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm18, %zmm21
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm8, %zmm2, %zmm0
 ; AVX512F-ONLY-FAST-NEXT:    vinserti32x4 $3, 192(%rax), %zmm0, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm31 {%k4}
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm14 {%k4}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu (%rsp), %ymm0 # 32-byte Reload
 ; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
 ; AVX512F-ONLY-FAST-NEXT:    movb $14, %sil
-; AVX512F-ONLY-FAST-NEXT:    kmovw %esi, %k3
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm29 {%k3}
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm9 {%k1}
-; AVX512F-ONLY-FAST-NEXT:    movb $120, %sil
 ; AVX512F-ONLY-FAST-NEXT:    kmovw %esi, %k4
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, %zmm0 {%k4}
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm13 # 64-byte Folded Reload
-; AVX512F-ONLY-FAST-NEXT:    # zmm13 = zmm0[0,1,2,3],mem[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm25 {%k4}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm23, %zmm6 {%k1}
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm17 {%k3}
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm16 {%k4}
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm5 {%k3}
-; AVX512F-ONLY-FAST-NEXT:    movb $-61, %sil
-; AVX512F-ONLY-FAST-NEXT:    kmovw %esi, %k5
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm13 {%k5}
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm7 {%k1}
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, %zmm3 {%k4}
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm27, %zmm1 {%k1}
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm27 {%k4}
-; AVX512F-ONLY-FAST-NEXT:    movb $24, %sil
-; AVX512F-ONLY-FAST-NEXT:    kmovw %esi, %k3
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm13 {%k4}
+; AVX512F-ONLY-FAST-NEXT:    movb $120, %sil
+; AVX512F-ONLY-FAST-NEXT:    kmovw %esi, %k4
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm9 {%k3}
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload
-; AVX512F-ONLY-FAST-NEXT:    # zmm1 = zmm0[0,1,2,3],mem[4,5,6,7]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k5}
-; AVX512F-ONLY-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm7 = zmm8[0,1,2,3],zmm10[4,5,6,7]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm7 {%k5}
-; AVX512F-ONLY-FAST-NEXT:    movb $-31, %sil
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm6, %zmm9 {%k4}
+; AVX512F-ONLY-FAST-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm22, %zmm5 # 64-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    # zmm5 = zmm22[0,1,2,3],mem[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm19, %zmm3 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm10 {%k4}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm31, %zmm1 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm18 {%k4}
+; AVX512F-ONLY-FAST-NEXT:    movb $-61, %sil
 ; AVX512F-ONLY-FAST-NEXT:    kmovw %esi, %k4
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm9 {%k4}
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm30 {%k3}
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm18, %zmm30 {%k4}
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm24, %zmm26 {%k3}
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm11, %zmm26 {%k4}
-; AVX512F-ONLY-FAST-NEXT:    vpbroadcastq 8(%rcx), %ymm0
-; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm5 {%k4}
+; AVX512F-ONLY-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm17[0,1,2,3],zmm29[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k4}
+; AVX512F-ONLY-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm4[0,1,2,3],zmm28[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm1 {%k4}
+; AVX512F-ONLY-FAST-NEXT:    vpbroadcastq 8(%rcx), %ymm3
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm3 = mem[0,1,2,3],ymm3[4,5,6,7]
 ; AVX512F-ONLY-FAST-NEXT:    movb $6, %sil
 ; AVX512F-ONLY-FAST-NEXT:    kmovw %esi, %k4
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm21 {%k4}
-; AVX512F-ONLY-FAST-NEXT:    vpbroadcastq 72(%rcx), %ymm0
-; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm20 {%k4}
-; AVX512F-ONLY-FAST-NEXT:    vpbroadcastq 136(%rcx), %ymm0
-; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k4}
-; AVX512F-ONLY-FAST-NEXT:    vpbroadcastq 200(%rcx), %ymm0
-; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm4 {%k4}
-; AVX512F-ONLY-FAST-NEXT:    movb $56, %cl
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm0, %zmm4 {%k4}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm6 {%k3}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm30 {%k3}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm27 {%k3}
+; AVX512F-ONLY-FAST-NEXT:    movb $-31, %sil
+; AVX512F-ONLY-FAST-NEXT:    kmovw %esi, %k3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm6 {%k3}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm6, %zmm19
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm30 {%k3}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm27 {%k3}
+; AVX512F-ONLY-FAST-NEXT:    movb $56, %sil
+; AVX512F-ONLY-FAST-NEXT:    kmovw %esi, %k3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k3}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm17
+; AVX512F-ONLY-FAST-NEXT:    vpbroadcastq 72(%rcx), %ymm3
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm3 = mem[0,1,2,3],ymm3[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm0, %zmm26 {%k4}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm26 {%k3}
+; AVX512F-ONLY-FAST-NEXT:    vpbroadcastq 136(%rcx), %ymm3
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm3 = mem[0,1,2,3],ymm3[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm0, %zmm20 {%k4}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm20 {%k3}
+; AVX512F-ONLY-FAST-NEXT:    vpbroadcastq 200(%rcx), %ymm3
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm3 = mem[0,1,2,3],ymm3[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm0, %zmm11 {%k4}
+; AVX512F-ONLY-FAST-NEXT:    movb $64, %cl
 ; AVX512F-ONLY-FAST-NEXT:    kmovw %ecx, %k4
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm21 {%k4}
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm20 {%k4}
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k4}
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm14, %zmm22, %zmm8
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 192(%rax), %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm9, %zmm8
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, %zmm4 {%k4}
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, %zmm6 {%k1}
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, %zmm11 {%k1}
-; AVX512F-ONLY-FAST-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm23, %zmm8 # 64-byte Folded Reload
-; AVX512F-ONLY-FAST-NEXT:    # zmm8 = zmm23[0,1,2,3],mem[4,5,6,7]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm9 = <0,11,u,u,4,5,6,7>
-; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm22, %zmm12, %zmm9
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm10 = <0,1,11,u,4,5,6,7>
-; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm14, %zmm9, %zmm10
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa 192(%r8), %ymm9
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm9 = ymm9[0],mem[0],ymm9[2],mem[2]
-; AVX512F-ONLY-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm11 {%k2} = zmm9[2,3,2,3],zmm0[2,3,2,3]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm11, %zmm12
-; AVX512F-ONLY-FAST-NEXT:    movb $64, %al
-; AVX512F-ONLY-FAST-NEXT:    kmovw %eax, %k1
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm22, %zmm8 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm6 {%k4}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm8, %zmm3, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 192(%rax), %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm4, %zmm2
 ; AVX512F-ONLY-FAST-NEXT:    movb $8, %al
-; AVX512F-ONLY-FAST-NEXT:    kmovw %eax, %k1
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm10 {%k1}
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm15, %zmm19 {%k3}
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm9 = <12,u,u,3,4,5,6,13>
-; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm22, %zmm6, %zmm9
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm6 = <u,1,2,3,4,15,u,u>
-; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm22, %zmm19, %zmm6
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm11 = <0,12,u,3,4,5,6,7>
-; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm14, %zmm9, %zmm11
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm9 = <13,u,2,3,4,5,6,14>
-; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm14, %zmm8, %zmm9
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm8 = <u,1,2,3,4,5,15,u>
-; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm14, %zmm6, %zmm8
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm6 = [0,1,12,3,4,5,6,7]
-; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm0, %zmm11, %zmm6
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm11 = [0,13,2,3,4,5,6,7]
-; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm0, %zmm9, %zmm11
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm9 = [14,1,2,3,4,5,6,15]
-; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm0, %zmm8, %zmm9
+; AVX512F-ONLY-FAST-NEXT:    kmovw %eax, %k4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm15 {%k4}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm11 {%k3}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm12, %zmm7 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 192(%r8), %ymm2
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm2 = ymm2[0],mem[0],ymm2[2],mem[2]
+; AVX512F-ONLY-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm7 {%k2} = zmm2[2,3,2,3],zmm3[2,3,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm2 = <0,12,u,3,4,5,6,7>
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm8, %zmm4, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm4 = <13,u,2,3,4,5,6,14>
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm8, %zmm6, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm6 = <u,1,2,3,4,5,15,u>
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm8, %zmm24, %zmm6
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm8 = [0,1,12,3,4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm3, %zmm2, %zmm8
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,13,2,3,4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm3, %zmm4, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [14,1,2,3,4,5,6,15]
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm3, %zmm6, %zmm4
 ; AVX512F-ONLY-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm12, 1472(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, 1408(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm26, 1280(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, 1216(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm27, 1152(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, 1088(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 1024(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, 960(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, 1472(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm11, 1408(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm27, 1280(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, 1216(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm18, 1152(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, 1088(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm1, 1024(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm20, 960(%rax)
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm30, 832(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, 768(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, 704(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm17, 640(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, 768(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm10, 704(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm16, 640(%rax)
 ; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 576(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm20, 512(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm26, 512(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm19, 384(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, 320(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, 256(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm25, 192(%rax)
 ; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 384(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, 320(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 128(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm17, 64(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm14, 1344(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm21, 896(%rax)
 ; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 256(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm29, 192(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 448(%rax)
 ; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 128(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm21, 64(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm31, 1344(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm28, 896(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm16, 448(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm25, (%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, 1728(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm11, 1664(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm6, 1600(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm10, 1536(%rax)
-; AVX512F-ONLY-FAST-NEXT:    addq $2024, %rsp # imm = 0x7E8
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, (%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, 1728(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, 1664(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, 1600(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm15, 1536(%rax)
+; AVX512F-ONLY-FAST-NEXT:    addq $2152, %rsp # imm = 0x868
 ; AVX512F-ONLY-FAST-NEXT:    vzeroupper
 ; AVX512F-ONLY-FAST-NEXT:    retq
 ;
 ; AVX512DQ-SLOW-LABEL: store_i64_stride7_vf32:
 ; AVX512DQ-SLOW:       # %bb.0:
-; AVX512DQ-SLOW-NEXT:    subq $2120, %rsp # imm = 0x848
+; AVX512DQ-SLOW-NEXT:    subq $2184, %rsp # imm = 0x888
 ; AVX512DQ-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm10
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 64(%rdi), %zmm8
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%rsi), %zmm15
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 64(%rsi), %zmm13
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm16
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 64(%rdi), %zmm9
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%rsi), %zmm23
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 64(%rsi), %zmm10
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 64(%rdx), %zmm20
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%rdx), %zmm21
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%rcx), %zmm5
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 64(%rcx), %zmm28
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%r8), %zmm3
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%r9), %zmm2
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%rax), %zmm4
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 64(%rax), %zmm7
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%rdx), %zmm18
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%rcx), %zmm6
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 64(%rcx), %zmm7
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%r8), %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%r9), %zmm3
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%rax), %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 64(%rax), %zmm4
 ; AVX512DQ-SLOW-NEXT:    movb $96, %r10b
 ; AVX512DQ-SLOW-NEXT:    kmovw %r10d, %k1
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [9,1,9,1,9,1,9,1]
-; AVX512DQ-SLOW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm1
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm2, %zmm0, %zmm1
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm14
-; AVX512DQ-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [4,9,0,3,4,9,0,3]
-; AVX512DQ-SLOW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm4, %zmm0, %zmm1
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm16
-; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [11,3,11,3,11,3,11,3]
+; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm17 = [15,7,15,7,15,7,15,7]
+; AVX512DQ-SLOW-NEXT:    # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [9,1,9,1,9,1,9,1]
 ; AVX512DQ-SLOW-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm0
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm15, %zmm1, %zmm0
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm18
-; AVX512DQ-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm9 = [2,10,0,3,2,10,0,3]
-; AVX512DQ-SLOW-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm1
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm5, %zmm9, %zmm1
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm17
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512DQ-SLOW-NEXT:    vmovdqa (%r9), %ymm9
-; AVX512DQ-SLOW-NEXT:    vmovdqu %ymm9, (%rsp) # 32-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa 64(%r9), %ymm11
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm5
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm3, %zmm1, %zmm5
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm12
+; AVX512DQ-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [4,9,0,3,4,9,0,3]
+; AVX512DQ-SLOW-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm2, %zmm1, %zmm5
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm13
+; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm25 = [11,3,11,3,11,3,11,3]
+; AVX512DQ-SLOW-NEXT:    # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm1
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm23, %zmm25, %zmm1
+; AVX512DQ-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [2,10,0,3,2,10,0,3]
+; AVX512DQ-SLOW-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm5
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm6, %zmm8, %zmm5
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm15
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm5 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqa (%r9), %ymm8
+; AVX512DQ-SLOW-NEXT:    vmovdqu %ymm8, (%rsp) # 32-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa 64(%r9), %ymm14
+; AVX512DQ-SLOW-NEXT:    vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa (%r8), %ymm1
+; AVX512DQ-SLOW-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa 64(%r8), %ymm11
 ; AVX512DQ-SLOW-NEXT:    vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa (%r8), %ymm0
-; AVX512DQ-SLOW-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa 64(%r8), %ymm12
-; AVX512DQ-SLOW-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm9[0],ymm0[2],ymm9[2]
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm8[0],ymm1[2],ymm8[2]
 ; AVX512DQ-SLOW-NEXT:    movb $28, %r10b
 ; AVX512DQ-SLOW-NEXT:    kmovw %r10d, %k2
-; AVX512DQ-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k2} = zmm0[2,3,2,3],zmm4[2,3,2,3]
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [4,12,0,5,4,12,0,5]
-; AVX512DQ-SLOW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm1
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm2, %zmm0, %zmm1
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm9
-; AVX512DQ-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [0,1,12,7,0,1,12,7]
-; AVX512DQ-SLOW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm4, %zmm0, %zmm1
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm22 = [5,0,14,6,5,0,14,6]
-; AVX512DQ-SLOW-NEXT:    # zmm22 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm19
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm3, %zmm22, %zmm19
-; AVX512DQ-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [0,13,6,7,0,13,6,7]
+; AVX512DQ-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm1[2,3,2,3],zmm2[2,3,2,3]
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [4,12,0,5,4,12,0,5]
+; AVX512DQ-SLOW-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm5
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm3, %zmm8, %zmm5
+; AVX512DQ-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,12,7,0,1,12,7]
 ; AVX512DQ-SLOW-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm4, %zmm1, %zmm19
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm19
-; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm4 = [15,7,15,7,15,7,15,7]
-; AVX512DQ-SLOW-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm2, %zmm4, %zmm3
-; AVX512DQ-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [6,13,14,7,6,13,14,7]
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm2, %zmm1, %zmm5
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm5
+; AVX512DQ-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm21 = [5,0,14,6,5,0,14,6]
+; AVX512DQ-SLOW-NEXT:    # zmm21 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm19
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm0, %zmm21, %zmm19
+; AVX512DQ-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [0,13,6,7,0,13,6,7]
 ; AVX512DQ-SLOW-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm3, %zmm2, %zmm19
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm3
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm13, %zmm18, %zmm3
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm19
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm28, %zmm17, %zmm19
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm23
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm19 {%k1}
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm12[0],ymm11[0],ymm12[2],ymm11[2]
-; AVX512DQ-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm19 {%k2} = zmm3[2,3,2,3],zmm7[2,3,2,3]
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm5, %zmm2, %zmm19
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 64(%r8), %zmm3
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 64(%r9), %zmm12
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm11
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm12, %zmm14, %zmm11
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm7, %zmm16, %zmm11
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm11
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm12, %zmm9, %zmm11
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm7, %zmm0, %zmm11
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm11
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm3, %zmm17, %zmm0
+; AVX512DQ-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [6,13,14,7,6,13,14,7]
+; AVX512DQ-SLOW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm0, %zmm3, %zmm5
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm0
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm10, %zmm25, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm5
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm7, %zmm15, %zmm5
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm5 {%k1}
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm11[0],ymm14[0],ymm11[2],ymm14[2]
+; AVX512DQ-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm0[2,3,2,3],zmm4[2,3,2,3]
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 64(%r8), %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 64(%r9), %zmm11
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm5
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm11, %zmm12, %zmm5
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm3, %zmm22, %zmm11
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm7, %zmm1, %zmm11
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm4, %zmm13, %zmm5
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm5
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm11, %zmm8, %zmm4
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm5, %zmm1, %zmm4
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm4
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm12, %zmm4, %zmm3
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm3, %zmm2, %zmm7
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 128(%rdi), %zmm24
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 128(%rsi), %zmm31
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm24, %zmm3
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm31, %zmm18, %zmm3
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 128(%rdx), %zmm17
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 128(%rcx), %zmm30
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm7
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm30, %zmm23, %zmm7
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm7 {%k1}
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 128(%rax), %zmm29
-; AVX512DQ-SLOW-NEXT:    vmovdqa 128(%r9), %ymm11
-; AVX512DQ-SLOW-NEXT:    vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa 128(%r8), %ymm3
-; AVX512DQ-SLOW-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm3[0],ymm11[0],ymm3[2],ymm11[2]
-; AVX512DQ-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm7 {%k2} = zmm3[2,3,2,3],zmm29[2,3,2,3]
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 128(%r8), %zmm3
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 128(%r9), %zmm7
-; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm7, %zmm3, %zmm9
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm29, %zmm0, %zmm9
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm3, %zmm7, %zmm22
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm29, %zmm1, %zmm22
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm0
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm7, %zmm14, %zmm0
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm29, %zmm16, %zmm0
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm7, %zmm4, %zmm3
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm3, %zmm2, %zmm29
-; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [13,5,13,5,13,5,13,5]
-; AVX512DQ-SLOW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm0
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm5, %zmm3, %zmm0
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm0, %zmm21, %zmm4
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm5, %zmm2, %zmm4
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm25 = [6,14,6,14,6,14,6,14]
-; AVX512DQ-SLOW-NEXT:    # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm0
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm5, %zmm25, %zmm0
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm11, %zmm17, %zmm0
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm0, %zmm3, %zmm5
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 128(%rdi), %zmm11
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 128(%rsi), %zmm19
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm5
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm19, %zmm25, %zmm5
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 128(%rdx), %zmm4
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 128(%rcx), %zmm31
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm0
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm31, %zmm15, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm0 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 128(%rax), %zmm15
+; AVX512DQ-SLOW-NEXT:    vmovdqa 128(%r9), %ymm14
+; AVX512DQ-SLOW-NEXT:    vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa 128(%r8), %ymm5
+; AVX512DQ-SLOW-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm5 = ymm5[0],ymm14[0],ymm5[2],ymm14[2]
+; AVX512DQ-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k2} = zmm5[2,3,2,3],zmm15[2,3,2,3]
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm0
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm5, %zmm4, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 128(%r8), %zmm5
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 128(%r9), %zmm0
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm0, %zmm5, %zmm8
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm15, %zmm1, %zmm8
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm5, %zmm0, %zmm21
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm15, %zmm2, %zmm21
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm1
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm0, %zmm12, %zmm1
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [3,0,12,4,3,0,12,4]
-; AVX512DQ-SLOW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm21, %zmm0, %zmm5
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm26
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm28, %zmm3, %zmm26
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm1
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm28, %zmm25, %zmm1
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm1
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm28, %zmm4, %zmm1
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm15, %zmm13, %zmm1
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm20, %zmm0, %zmm28
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 192(%rdx), %zmm12
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 192(%rcx), %zmm2
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm1
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm2, %zmm25, %zmm1
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm11
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm30, %zmm3, %zmm11
-; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm30, %zmm17, %zmm25
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm22
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm30, %zmm4, %zmm22
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm17, %zmm0, %zmm30
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm16
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm12, %zmm0, %zmm16
-; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm2, %zmm12, %zmm23
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm0
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm2, %zmm4, %zmm12
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm2, %zmm3, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm0, %zmm17, %zmm5
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm5, %zmm3, %zmm15
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [13,5,13,5,13,5,13,5]
+; AVX512DQ-SLOW-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm0
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm6, %zmm1, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm6 = [6,14,6,14,6,14,6,14]
+; AVX512DQ-SLOW-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm2
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm0, %zmm6, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm2
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm0, %zmm17, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm15
+; AVX512DQ-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [3,0,12,4,3,0,12,4]
+; AVX512DQ-SLOW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm18, %zmm3, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm0
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm7, %zmm1, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm0
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm7, %zmm6, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm0
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm7, %zmm17, %zmm0
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm20, %zmm3, %zmm7
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm5
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm29
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm31, %zmm1, %zmm29
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm30
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm31, %zmm6, %zmm30
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm0
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm31, %zmm17, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm4, %zmm3, %zmm31
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 192(%rdx), %zmm12
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 192(%rcx), %zmm21
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm22
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm12, %zmm3, %zmm22
 ; AVX512DQ-SLOW-NEXT:    movb $48, %r10b
 ; AVX512DQ-SLOW-NEXT:    kmovw %r10d, %k3
-; AVX512DQ-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm4 = [1,0,10,2,1,0,10,2]
-; AVX512DQ-SLOW-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm15, %zmm20
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm10, %zmm4, %zmm20
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm5 {%k3} = zmm10[0],zmm15[0],zmm10[2],zmm15[2],zmm10[4],zmm15[4],zmm10[6],zmm15[6]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 192(%rdi), %zmm8
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 192(%rsi), %zmm7
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm22 {%k3} = zmm8[0],zmm7[0],zmm8[2],zmm7[2],zmm8[4],zmm7[4],zmm8[6],zmm7[6]
+; AVX512DQ-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm14 = [1,0,10,2,1,0,10,2]
+; AVX512DQ-SLOW-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm28
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm16, %zmm14, %zmm28
+; AVX512DQ-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm13 = [0,8,0,1,0,8,0,1]
+; AVX512DQ-SLOW-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm27
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm23, %zmm13, %zmm27
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm26
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm9, %zmm14, %zmm26
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm23
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm10, %zmm13, %zmm23
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm19, %zmm20
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm11, %zmm14, %zmm20
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm18
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm19, %zmm13, %zmm18
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm7, %zmm8, %zmm13
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm8, %zmm7, %zmm14
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm24
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm3
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm4
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm7, %zmm25, %zmm8
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm7, %zmm6, %zmm24
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm7, %zmm1, %zmm3
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm7, %zmm17, %zmm4
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm2 {%k3} = zmm16[0],zmm0[0],zmm16[2],zmm0[2],zmm16[4],zmm0[4],zmm16[6],zmm0[6]
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm25
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm17
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm0, %zmm1, %zmm25
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm0, %zmm6, %zmm2
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm0, %zmm15, %zmm17
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm5 {%k3} = zmm9[0],zmm10[0],zmm9[2],zmm10[2],zmm9[4],zmm10[4],zmm9[6],zmm10[6]
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm18
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm7
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm21
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm14
-; AVX512DQ-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [0,8,0,1,0,8,0,1]
-; AVX512DQ-SLOW-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm15, %zmm2, %zmm14
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm15, %zmm3, %zmm18
-; AVX512DQ-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14]
-; AVX512DQ-SLOW-NEXT:    # ymm1 = mem[0,1,0,1]
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm15, %zmm1, %zmm7
-; AVX512DQ-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm6 = [15,7,15,7]
-; AVX512DQ-SLOW-NEXT:    # ymm6 = mem[0,1,0,1]
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm15, %zmm6, %zmm21
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm15
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm8, %zmm4, %zmm15
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm28 {%k3} = zmm8[0],zmm13[0],zmm8[2],zmm13[2],zmm8[4],zmm13[4],zmm8[6],zmm13[6]
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm23
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm5
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm17
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm10
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm13, %zmm2, %zmm10
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm13, %zmm3, %zmm23
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm13, %zmm1, %zmm5
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm13, %zmm6, %zmm17
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 192(%rdi), %zmm19
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 192(%rsi), %zmm0
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm19, %zmm27
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm0, %zmm1, %zmm27
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm19, %zmm13
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm0, %zmm3, %zmm13
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm8
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm24, %zmm4, %zmm8
-; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm31, %zmm24, %zmm3
-; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm31, %zmm24, %zmm1
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm30 {%k3} = zmm24[0],zmm31[0],zmm24[2],zmm31[2],zmm24[4],zmm31[4],zmm24[6],zmm31[6]
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm24, %zmm9
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm31, %zmm2, %zmm24
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm31, %zmm6, %zmm9
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm16 {%k3} = zmm19[0],zmm0[0],zmm19[2],zmm0[2],zmm19[4],zmm0[4],zmm19[6],zmm0[6]
-; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm0, %zmm19, %zmm2
-; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm19, %zmm0, %zmm4
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm0, %zmm19, %zmm31
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm0, %zmm6, %zmm19
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm18 {%k1}
-; AVX512DQ-SLOW-NEXT:    movb $120, %sil
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm5
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm16
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm10, %zmm1, %zmm0
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm10, %zmm6, %zmm5
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm10, %zmm15, %zmm16
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm9
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm21, %zmm1, %zmm9
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm19, %zmm11, %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm10
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm21, %zmm6, %zmm10
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm19, %zmm11, %zmm6
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm31 {%k3} = zmm11[0],zmm19[0],zmm11[2],zmm19[2],zmm11[4],zmm19[4],zmm11[6],zmm19[6]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm7
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm19, %zmm15, %zmm7
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm11
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm21, %zmm15, %zmm11
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm21, %zmm15, %zmm12
+; AVX512DQ-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm24 = zmm24[0,1,2,3],zmm10[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 192(%r8), %zmm19
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm10 = <0,11,u,u,4,5,6,7>
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm19, %zmm22, %zmm10
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm3 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm22 = <12,u,u,3,4,5,6,13>
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm19, %zmm3, %zmm22
+; AVX512DQ-SLOW-NEXT:    movb $24, %sil
 ; AVX512DQ-SLOW-NEXT:    kmovw %esi, %k3
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm0 {%k3}
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm18 # 64-byte Folded Reload
-; AVX512DQ-SLOW-NEXT:    # zmm18 = zmm7[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm4 {%k3}
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm21 = <u,1,2,3,4,15,u,u>
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm19, %zmm4, %zmm21
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm25 {%k1}
+; AVX512DQ-SLOW-NEXT:    movb $120, %sil
+; AVX512DQ-SLOW-NEXT:    kmovw %esi, %k4
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm25, %zmm11 {%k4}
+; AVX512DQ-SLOW-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm3 # 64-byte Folded Reload
+; AVX512DQ-SLOW-NEXT:    # zmm3 = zmm2[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm0 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm15 {%k4}
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm29, %zmm1 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm25 {%k4}
 ; AVX512DQ-SLOW-NEXT:    movb $-61, %sil
 ; AVX512DQ-SLOW-NEXT:    kmovw %esi, %k4
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm18 {%k4}
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm26, %zmm23 {%k1}
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm26 {%k3}
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm3 {%k1}
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm0 {%k3}
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    movb $24, %sil
-; AVX512DQ-SLOW-NEXT:    kmovw %esi, %k3
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm3 {%k4}
+; AVX512DQ-SLOW-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm4 # 64-byte Folded Reload
+; AVX512DQ-SLOW-NEXT:    # zmm4 = zmm5[0,1,2,3],mem[4,5,6,7]
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm21 {%k3}
-; AVX512DQ-SLOW-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm3 # 64-byte Folded Reload
-; AVX512DQ-SLOW-NEXT:    # zmm3 = zmm5[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm4 {%k4}
+; AVX512DQ-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm6[0,1,2,3],zmm30[4,5,6,7]
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm3 {%k4}
-; AVX512DQ-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm25[4,5,6,7]
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k4}
-; AVX512DQ-SLOW-NEXT:    movb $-31, %sil
-; AVX512DQ-SLOW-NEXT:    kmovw %esi, %k4
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm21 {%k4}
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm17 {%k3}
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm17 {%k4}
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm22, %zmm9 {%k3}
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm29, %zmm9 {%k4}
-; AVX512DQ-SLOW-NEXT:    vmovdqa (%rdx), %xmm1
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0]
-; AVX512DQ-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k4}
+; AVX512DQ-SLOW-NEXT:    vmovdqa (%rdx), %xmm0
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0]
+; AVX512DQ-SLOW-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
 ; AVX512DQ-SLOW-NEXT:    movb $12, %sil
 ; AVX512DQ-SLOW-NEXT:    kmovw %esi, %k4
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm14 {%k4}
-; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [0,8,0,8,0,8,0,8]
-; AVX512DQ-SLOW-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm6 # 64-byte Folded Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqa 64(%rdx), %xmm5
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0]
-; AVX512DQ-SLOW-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm5, %zmm0, %zmm10 {%k4}
-; AVX512DQ-SLOW-NEXT:    vmovdqa 128(%rdx), %xmm5
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0]
-; AVX512DQ-SLOW-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm5, %zmm0, %zmm24 {%k4}
-; AVX512DQ-SLOW-NEXT:    vmovdqa 192(%rdx), %xmm5
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0]
-; AVX512DQ-SLOW-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm5, %zmm0, %zmm2 {%k4}
-; AVX512DQ-SLOW-NEXT:    movb $112, %sil
-; AVX512DQ-SLOW-NEXT:    kmovw %esi, %k4
-; AVX512DQ-SLOW-NEXT:    vinserti64x2 $3, (%rax), %zmm6, %zmm14 {%k4}
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm27 {%k4}
+; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,0,8,0,8,0,8]
+; AVX512DQ-SLOW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm5 # 64-byte Folded Reload
-; AVX512DQ-SLOW-NEXT:    vinserti64x2 $3, 64(%rax), %zmm5, %zmm10 {%k4}
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm17 {%k3}
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm5 # 64-byte Folded Reload
-; AVX512DQ-SLOW-NEXT:    vinserti64x2 $3, 128(%rax), %zmm5, %zmm24 {%k4}
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 192(%r8), %zmm5
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 192(%r9), %zmm11
-; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm11, %zmm5, %zmm1
-; AVX512DQ-SLOW-NEXT:    vinserti64x2 $3, 192(%rax), %zmm1, %zmm2 {%k4}
-; AVX512DQ-SLOW-NEXT:    vpbroadcastq 8(%rcx), %ymm1
-; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm16 {%k3}
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm7 {%k3}
+; AVX512DQ-SLOW-NEXT:    movb $-31, %sil
+; AVX512DQ-SLOW-NEXT:    kmovw %esi, %k3
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm17 {%k3}
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm16 {%k3}
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm7 {%k3}
+; AVX512DQ-SLOW-NEXT:    movb $112, %sil
+; AVX512DQ-SLOW-NEXT:    kmovw %esi, %k3
+; AVX512DQ-SLOW-NEXT:    vinserti64x2 $3, (%rax), %zmm2, %zmm27 {%k3}
+; AVX512DQ-SLOW-NEXT:    vmovdqa 64(%rdx), %xmm2
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0]
+; AVX512DQ-SLOW-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm0, %zmm23 {%k4}
+; AVX512DQ-SLOW-NEXT:    vmovdqa 128(%rdx), %xmm2
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0]
+; AVX512DQ-SLOW-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm0, %zmm18 {%k4}
+; AVX512DQ-SLOW-NEXT:    vmovdqa 192(%rdx), %xmm2
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0]
+; AVX512DQ-SLOW-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm0, %zmm13 {%k4}
+; AVX512DQ-SLOW-NEXT:    vpbroadcastq 8(%rcx), %ymm2
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = mem[0,1,2,3],ymm2[4,5,6,7]
 ; AVX512DQ-SLOW-NEXT:    movb $6, %sil
 ; AVX512DQ-SLOW-NEXT:    kmovw %esi, %k4
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm20 {%k4}
-; AVX512DQ-SLOW-NEXT:    vpbroadcastq 72(%rcx), %ymm1
-; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm15 {%k4}
-; AVX512DQ-SLOW-NEXT:    vpbroadcastq 136(%rcx), %ymm1
-; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm8 {%k4}
-; AVX512DQ-SLOW-NEXT:    vpbroadcastq 200(%rcx), %ymm1
-; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm4 {%k4}
-; AVX512DQ-SLOW-NEXT:    movb $56, %cl
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm0, %zmm28 {%k4}
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
+; AVX512DQ-SLOW-NEXT:    vinserti64x2 $3, 64(%rax), %zmm2, %zmm23 {%k3}
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
+; AVX512DQ-SLOW-NEXT:    vinserti64x2 $3, 128(%rax), %zmm2, %zmm18 {%k3}
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 192(%r9), %zmm2
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm2, %zmm19, %zmm0
+; AVX512DQ-SLOW-NEXT:    vinserti64x2 $3, 192(%rax), %zmm0, %zmm13 {%k3}
+; AVX512DQ-SLOW-NEXT:    movb $56, %sil
+; AVX512DQ-SLOW-NEXT:    kmovw %esi, %k3
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm28 {%k3}
+; AVX512DQ-SLOW-NEXT:    vpbroadcastq 72(%rcx), %ymm0
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm26 {%k4}
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm26 {%k3}
+; AVX512DQ-SLOW-NEXT:    vpbroadcastq 136(%rcx), %ymm0
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm20 {%k4}
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm20 {%k3}
+; AVX512DQ-SLOW-NEXT:    vpbroadcastq 200(%rcx), %ymm0
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm14 {%k4}
+; AVX512DQ-SLOW-NEXT:    movb $64, %cl
 ; AVX512DQ-SLOW-NEXT:    kmovw %ecx, %k4
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm20 {%k4}
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm15 {%k4}
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm8 {%k4}
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm11, %zmm5, %zmm6
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 192(%rax), %zmm1
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm1, %zmm7, %zmm6
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm4 {%k4}
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm13 {%k1}
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm23 {%k1}
-; AVX512DQ-SLOW-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm7 # 64-byte Folded Reload
-; AVX512DQ-SLOW-NEXT:    # zmm7 = zmm27[0,1,2,3],mem[4,5,6,7]
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm22 = <0,11,u,u,4,5,6,7>
-; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm5, %zmm16, %zmm22
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm6 = <0,1,11,u,4,5,6,7>
-; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm11, %zmm22, %zmm6
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 192(%r8), %ymm16
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm16 = ymm16[0],mem[0],ymm16[2],mem[2]
-; AVX512DQ-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm23 {%k2} = zmm16[2,3,2,3],zmm1[2,3,2,3]
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm19 {%k3}
-; AVX512DQ-SLOW-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpunpckhqdq (%rsp), %ymm12, %ymm12 # 32-byte Folded Reload
-; AVX512DQ-SLOW-NEXT:    # ymm12 = ymm12[1],mem[1],ymm12[3],mem[3]
-; AVX512DQ-SLOW-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[0,2,3,3]
-; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],mem[6,7]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm19, %zmm24 {%k4}
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm2, %zmm0, %zmm19
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 192(%rax), %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm0, %zmm5, %zmm19
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm19, %zmm14 {%k3}
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm5 = <0,1,11,u,4,5,6,7>
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm2, %zmm10, %zmm5
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm12 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqa 192(%r8), %ymm6
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm6 = ymm6[0],mem[0],ymm6[2],mem[2]
+; AVX512DQ-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm12 {%k2} = zmm6[2,3,2,3],zmm0[2,3,2,3]
+; AVX512DQ-SLOW-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq (%rsp), %ymm6, %ymm6 # 32-byte Folded Reload
+; AVX512DQ-SLOW-NEXT:    # ymm6 = ymm6[1],mem[1],ymm6[3],mem[3]
+; AVX512DQ-SLOW-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[0,2,3,3]
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],mem[6,7]
 ; AVX512DQ-SLOW-NEXT:    movb $14, %cl
 ; AVX512DQ-SLOW-NEXT:    kmovw %ecx, %k1
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm12, %zmm0, %zmm16 {%k1}
-; AVX512DQ-SLOW-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload
-; AVX512DQ-SLOW-NEXT:    # ymm12 = ymm12[1],mem[1],ymm12[3],mem[3]
-; AVX512DQ-SLOW-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[0,2,3,3]
-; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],mem[6,7]
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm12, %zmm0, %zmm28 {%k1}
-; AVX512DQ-SLOW-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload
-; AVX512DQ-SLOW-NEXT:    # ymm12 = ymm12[1],mem[1],ymm12[3],mem[3]
-; AVX512DQ-SLOW-NEXT:    movb $64, %cl
-; AVX512DQ-SLOW-NEXT:    kmovw %ecx, %k2
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm7 {%k2}
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm6, %zmm0, %zmm10 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
+; AVX512DQ-SLOW-NEXT:    # ymm6 = ymm6[1],mem[1],ymm6[3],mem[3]
+; AVX512DQ-SLOW-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[0,2,3,3]
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],mem[6,7]
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm6, %zmm0, %zmm19 {%k1}
 ; AVX512DQ-SLOW-NEXT:    movb $8, %cl
 ; AVX512DQ-SLOW-NEXT:    kmovw %ecx, %k2
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm6 {%k2}
-; AVX512DQ-SLOW-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[0,2,3,3]
-; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],mem[6,7]
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm12, %zmm0, %zmm30 {%k1}
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm12 = <12,u,u,3,4,5,6,13>
-; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm5, %zmm13, %zmm12
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm13 = <u,1,2,3,4,15,u,u>
-; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm5, %zmm19, %zmm13
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm5 = <0,12,u,3,4,5,6,7>
-; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm11, %zmm12, %zmm5
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm12 = <13,u,2,3,4,5,6,14>
-; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm11, %zmm7, %zmm12
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm7 = <u,1,2,3,4,5,15,u>
-; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm11, %zmm13, %zmm7
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm11 = [0,1,12,3,4,5,6,7]
-; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm1, %zmm5, %zmm11
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm5 = [0,13,2,3,4,5,6,7]
-; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm1, %zmm12, %zmm5
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm12 = [14,1,2,3,4,5,6,15]
-; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm1, %zmm7, %zmm12
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm5 {%k2}
+; AVX512DQ-SLOW-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
+; AVX512DQ-SLOW-NEXT:    # ymm6 = ymm6[1],mem[1],ymm6[3],mem[3]
+; AVX512DQ-SLOW-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[0,2,3,3]
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],mem[6,7]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm6, %zmm0, %zmm31 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm6 = <0,12,u,3,4,5,6,7>
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm2, %zmm22, %zmm6
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm8 = <13,u,2,3,4,5,6,14>
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm2, %zmm24, %zmm8
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm9 = <u,1,2,3,4,5,15,u>
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm2, %zmm21, %zmm9
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,12,3,4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm0, %zmm6, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm6 = [0,13,2,3,4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm0, %zmm8, %zmm6
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm8 = [14,1,2,3,4,5,6,15]
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm0, %zmm9, %zmm8
 ; AVX512DQ-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm23, 1472(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm4, 1408(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm2, 1344(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm9, 1280(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, 1216(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 1152(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm30, 1088(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm12, 1472(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm14, 1408(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm13, 1344(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm7, 1280(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, 1216(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm25, 1152(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm31, 1088(%rax)
 ; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 1024(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm8, 960(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm24, 896(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm17, 832(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, 768(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm26, 704(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm28, 640(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm20, 960(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm18, 896(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm16, 832(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm4, 768(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm15, 704(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm19, 640(%rax)
 ; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 576(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm15, 512(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm10, 448(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm21, 384(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm18, 320(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 256(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm16, 192(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm26, 512(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm23, 448(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm17, 384(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, 320(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm11, 256(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm10, 192(%rax)
 ; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 128(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm20, 64(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm14, (%rax)
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm12, 1728(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm5, 1664(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm11, 1600(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm6, 1536(%rax)
-; AVX512DQ-SLOW-NEXT:    addq $2120, %rsp # imm = 0x848
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm28, 64(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm27, (%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm8, 1728(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm6, 1664(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm2, 1600(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm5, 1536(%rax)
+; AVX512DQ-SLOW-NEXT:    addq $2184, %rsp # imm = 0x888
 ; AVX512DQ-SLOW-NEXT:    vzeroupper
 ; AVX512DQ-SLOW-NEXT:    retq
 ;
 ; AVX512DQ-FAST-LABEL: store_i64_stride7_vf32:
 ; AVX512DQ-FAST:       # %bb.0:
-; AVX512DQ-FAST-NEXT:    subq $2056, %rsp # imm = 0x808
+; AVX512DQ-FAST-NEXT:    subq $2088, %rsp # imm = 0x828
 ; AVX512DQ-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-FAST-NEXT:    vmovdqa64 (%rdi), %zmm10
-; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%rdi), %zmm15
-; AVX512DQ-FAST-NEXT:    vmovdqa64 (%rsi), %zmm16
-; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%rsi), %zmm18
-; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%rdx), %zmm21
-; AVX512DQ-FAST-NEXT:    vmovdqa64 (%rdx), %zmm9
-; AVX512DQ-FAST-NEXT:    vmovdqa64 (%rcx), %zmm20
-; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%rcx), %zmm17
-; AVX512DQ-FAST-NEXT:    vmovdqa64 (%r8), %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqa64 (%rdi), %zmm8
+; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%rdi), %zmm9
+; AVX512DQ-FAST-NEXT:    vmovdqa64 (%rsi), %zmm26
+; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%rsi), %zmm24
+; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%rdx), %zmm14
+; AVX512DQ-FAST-NEXT:    vmovdqa64 (%rdx), %zmm11
+; AVX512DQ-FAST-NEXT:    vmovdqa64 (%rcx), %zmm22
+; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%rcx), %zmm28
+; AVX512DQ-FAST-NEXT:    vmovdqa64 (%r8), %zmm3
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 (%r9), %zmm6
-; AVX512DQ-FAST-NEXT:    vmovdqa64 (%rax), %zmm8
+; AVX512DQ-FAST-NEXT:    vmovdqa64 (%rax), %zmm5
 ; AVX512DQ-FAST-NEXT:    movb $96, %r10b
 ; AVX512DQ-FAST-NEXT:    kmovw %r10d, %k1
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [9,1,9,1,9,1,9,1]
-; AVX512DQ-FAST-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm2
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm6, %zmm1, %zmm2
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm1, %zmm11
-; AVX512DQ-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [4,9,0,3,4,9,0,3]
-; AVX512DQ-FAST-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm8, %zmm1, %zmm2
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm1, %zmm12
-; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [11,3,11,3,11,3,11,3]
-; AVX512DQ-FAST-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm10, %zmm1
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm16, %zmm2, %zmm1
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, %zmm14
-; AVX512DQ-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [2,10,0,3,2,10,0,3]
-; AVX512DQ-FAST-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm9, %zmm2
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm20, %zmm3, %zmm2
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, %zmm19
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, %zmm4
+; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm19 = [15,7,15,7,15,7,15,7]
+; AVX512DQ-FAST-NEXT:    # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [9,1,9,1,9,1,9,1]
+; AVX512DQ-FAST-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, %zmm1
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm6, %zmm0, %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm12
+; AVX512DQ-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [4,9,0,3,4,9,0,3]
+; AVX512DQ-FAST-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm5, %zmm0, %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm13
+; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm31 = [11,3,11,3,11,3,11,3]
+; AVX512DQ-FAST-NEXT:    # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm8, %zmm0
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm26, %zmm31, %zmm0
+; AVX512DQ-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [2,10,0,3,2,10,0,3]
+; AVX512DQ-FAST-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm11, %zmm1
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm22, %zmm2, %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, %zmm18
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm1, %zmm10
 ; AVX512DQ-FAST-NEXT:    vmovdqa (%r9), %ymm1
-; AVX512DQ-FAST-NEXT:    vmovdqa 64(%r9), %ymm5
-; AVX512DQ-FAST-NEXT:    vmovdqa 128(%r9), %ymm3
+; AVX512DQ-FAST-NEXT:    vmovdqa 64(%r9), %ymm0
+; AVX512DQ-FAST-NEXT:    vmovdqa 128(%r9), %ymm4
 ; AVX512DQ-FAST-NEXT:    vmovdqa (%r8), %ymm7
-; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%r8), %ymm23
-; AVX512DQ-FAST-NEXT:    vmovdqa64 128(%r8), %ymm22
+; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%r8), %ymm16
+; AVX512DQ-FAST-NEXT:    vmovdqa 128(%r8), %ymm15
 ; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm2 = ymm7[0],ymm1[0],ymm7[2],ymm1[2]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %ymm7, %ymm17
 ; AVX512DQ-FAST-NEXT:    movb $28, %r10b
 ; AVX512DQ-FAST-NEXT:    kmovw %r10d, %k2
-; AVX512DQ-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm2[2,3,2,3],zmm8[2,3,2,3]
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} ymm4 = [1,3,7,7]
-; AVX512DQ-FAST-NEXT:    vpermt2q %ymm1, %ymm4, %ymm7
-; AVX512DQ-FAST-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [4,12,0,5,4,12,0,5]
-; AVX512DQ-FAST-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm6, %zmm2, %zmm1
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, %zmm13
-; AVX512DQ-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [0,1,12,7,0,1,12,7]
-; AVX512DQ-FAST-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm8, %zmm7, %zmm1
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm24 = [5,0,14,6,5,0,14,6]
-; AVX512DQ-FAST-NEXT:    # zmm24 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm5, %zmm7
+; AVX512DQ-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm10 {%k2} = zmm2[2,3,2,3],zmm5[2,3,2,3]
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} ymm5 = [1,3,7,7]
+; AVX512DQ-FAST-NEXT:    vpermt2q %ymm1, %ymm5, %ymm17
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %ymm17, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [4,12,0,5,4,12,0,5]
+; AVX512DQ-FAST-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, %zmm2
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm6, %zmm1
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm0, %zmm24, %zmm1
-; AVX512DQ-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [0,13,6,7,0,13,6,7]
-; AVX512DQ-FAST-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm8, %zmm2, %zmm1
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm6, %zmm10, %zmm2
+; AVX512DQ-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm6 = [0,1,12,7,0,1,12,7]
+; AVX512DQ-FAST-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm7, %zmm6, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm17 = [5,0,14,6,5,0,14,6]
+; AVX512DQ-FAST-NEXT:    # zmm17 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm1, %zmm20
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm1, %zmm2
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm26 = [15,7,15,7,15,7,15,7]
-; AVX512DQ-FAST-NEXT:    # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm6, %zmm26, %zmm0
-; AVX512DQ-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [6,13,14,7,6,13,14,7]
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm3, %zmm17, %zmm20
+; AVX512DQ-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [0,13,6,7,0,13,6,7]
 ; AVX512DQ-FAST-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm0, %zmm1, %zmm8
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm23[0],ymm5[0],ymm23[2],ymm5[2]
-; AVX512DQ-FAST-NEXT:    vpermt2q %ymm5, %ymm4, %ymm23
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %ymm23, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm8 = ymm22[0],ymm3[0],ymm22[2],ymm3[2]
-; AVX512DQ-FAST-NEXT:    vpermt2q %ymm3, %ymm4, %ymm22
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %ymm22, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm15, %zmm23
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm15, %zmm3
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm7, %zmm1, %zmm20
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm2, %zmm19, %zmm3
+; AVX512DQ-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [6,13,14,7,6,13,14,7]
+; AVX512DQ-FAST-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm3, %zmm2, %zmm7
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm16[0],ymm0[0],ymm16[2],ymm0[2]
+; AVX512DQ-FAST-NEXT:    vpermt2q %ymm0, %ymm5, %ymm16
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %ymm16, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm15[0],ymm4[0],ymm15[2],ymm4[2]
+; AVX512DQ-FAST-NEXT:    vpermt2q %ymm4, %ymm5, %ymm15
+; AVX512DQ-FAST-NEXT:    vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm9, %zmm4
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm24, %zmm31, %zmm4
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm14, %zmm5
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm18, %zmm14, %zmm3
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm21, %zmm4
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm17, %zmm19, %zmm4
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
-; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%rax), %zmm14
-; AVX512DQ-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm0[2,3,2,3],zmm14[2,3,2,3]
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%r8), %zmm0
-; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%r9), %zmm22
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm4
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm22, %zmm11, %zmm4
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm12, %zmm3
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm28, %zmm18, %zmm5
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm4, %zmm5 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%rax), %zmm7
+; AVX512DQ-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm3[2,3,2,3],zmm7[2,3,2,3]
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%r8), %zmm3
+; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%r9), %zmm5
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, %zmm4
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm14, %zmm12, %zmm4
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm5, %zmm12, %zmm4
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm7, %zmm13, %zmm4
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm6
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm13, %zmm4
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm22, %zmm13, %zmm6
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm14, %zmm7, %zmm6
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm22, %zmm6
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm0, %zmm24, %zmm6
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm14, %zmm2, %zmm6
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm22, %zmm26, %zmm0
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm0, %zmm1, %zmm14
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, %zmm4
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm5, %zmm10, %zmm4
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm7, %zmm6, %zmm4
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm5, %zmm4
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm3, %zmm17, %zmm4
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm7, %zmm1, %zmm4
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm5, %zmm19, %zmm3
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm3, %zmm2, %zmm7
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 128(%rdi), %zmm27
-; AVX512DQ-FAST-NEXT:    vmovdqa64 128(%rsi), %zmm6
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm27, %zmm0
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm6, %zmm5, %zmm0
-; AVX512DQ-FAST-NEXT:    vmovdqa64 128(%rdx), %zmm13
-; AVX512DQ-FAST-NEXT:    vmovdqa64 128(%rcx), %zmm5
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm13, %zmm22
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm5, %zmm19, %zmm22
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm19, %zmm28
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm22 {%k1}
-; AVX512DQ-FAST-NEXT:    vmovdqa64 128(%rax), %zmm12
-; AVX512DQ-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm22 {%k2} = zmm8[2,3,2,3],zmm12[2,3,2,3]
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 128(%r8), %zmm0
-; AVX512DQ-FAST-NEXT:    vmovdqa64 128(%r9), %zmm8
-; AVX512DQ-FAST-NEXT:    vpermi2q %zmm8, %zmm0, %zmm4
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm12, %zmm7, %zmm4
+; AVX512DQ-FAST-NEXT:    vmovdqa64 128(%rsi), %zmm20
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm27, %zmm3
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm20, %zmm31, %zmm3
+; AVX512DQ-FAST-NEXT:    vmovdqa64 128(%rdx), %zmm29
+; AVX512DQ-FAST-NEXT:    vmovdqa64 128(%rcx), %zmm23
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm29, %zmm4
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm23, %zmm18, %zmm4
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqa64 128(%rax), %zmm30
+; AVX512DQ-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm0[2,3,2,3],zmm30[2,3,2,3]
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vpermi2q %zmm0, %zmm8, %zmm24
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm12, %zmm2, %zmm24
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm2
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm8, %zmm11, %zmm2
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm12, %zmm3, %zmm2
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 128(%r8), %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqa64 128(%r9), %zmm3
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm3, %zmm0, %zmm10
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm30, %zmm6, %zmm10
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm0, %zmm3, %zmm17
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm30, %zmm1, %zmm17
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm3, %zmm12, %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm30, %zmm13, %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm3, %zmm19, %zmm0
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm0, %zmm2, %zmm30
+; AVX512DQ-FAST-NEXT:    vmovdqa64 192(%rdx), %zmm10
+; AVX512DQ-FAST-NEXT:    vmovdqa64 192(%rcx), %zmm1
+; AVX512DQ-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm25 = [3,0,12,4,3,0,12,4]
+; AVX512DQ-FAST-NEXT:    # zmm25 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm1, %zmm4
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm10, %zmm25, %zmm4
+; AVX512DQ-FAST-NEXT:    movb $48, %r10b
+; AVX512DQ-FAST-NEXT:    kmovw %r10d, %k3
+; AVX512DQ-FAST-NEXT:    vmovdqa64 192(%rdi), %zmm18
+; AVX512DQ-FAST-NEXT:    vmovdqa64 192(%rsi), %zmm7
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm4 {%k3} = zmm18[0],zmm7[0],zmm18[2],zmm7[2],zmm18[4],zmm7[4],zmm18[6],zmm7[6]
+; AVX512DQ-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm16 = [0,8,0,1,0,8,0,1]
+; AVX512DQ-FAST-NEXT:    # zmm16 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm8, %zmm0
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm26, %zmm16, %zmm0
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm8, %zmm26, %zmm0
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm0, %zmm1, %zmm12
-; AVX512DQ-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm15 = [3,0,12,4,3,0,12,4]
+; AVX512DQ-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm15 = [1,0,10,2,1,0,10,2]
 ; AVX512DQ-FAST-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm20, %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm26, %zmm0
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm8, %zmm15, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm9, %zmm0
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm9, %zmm15, %zmm1
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm1, %zmm3
-; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [13,5,13,5,13,5,13,5]
-; AVX512DQ-FAST-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm9, %zmm2
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm20, %zmm1, %zmm2
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm9 = [6,14,6,14,6,14,6,14]
-; AVX512DQ-FAST-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm2
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm20, %zmm9, %zmm2
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm20, %zmm26, %zmm0
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm24, %zmm16, %zmm0
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    movb $48, %r10b
-; AVX512DQ-FAST-NEXT:    kmovw %r10d, %k3
-; AVX512DQ-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm11 = [0,8,0,1,0,8,0,1]
-; AVX512DQ-FAST-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm10, %zmm29
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm16, %zmm11, %zmm29
-; AVX512DQ-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [1,0,10,2,1,0,10,2]
-; AVX512DQ-FAST-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm16, %zmm31
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm10, %zmm8, %zmm31
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm3 {%k3} = zmm10[0],zmm16[0],zmm10[2],zmm16[2],zmm10[4],zmm16[4],zmm10[6],zmm16[6]
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm3, (%rsp) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm10, %zmm7
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm16, %zmm1, %zmm7
-; AVX512DQ-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [6,14,6,14]
-; AVX512DQ-FAST-NEXT:    # ymm4 = mem[0,1,0,1]
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm10, %zmm24
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm16, %zmm4, %zmm24
-; AVX512DQ-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [15,7,15,7]
-; AVX512DQ-FAST-NEXT:    # ymm2 = mem[0,1,0,1]
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm16, %zmm2, %zmm10
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm17, %zmm22
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm21, %zmm15, %zmm22
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm21, %zmm25
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm17, %zmm1, %zmm25
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm21, %zmm30
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm17, %zmm9, %zmm30
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm17, %zmm26, %zmm21
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm23, %zmm0
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm23, %zmm19
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm18, %zmm11, %zmm19
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm18, %zmm17
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm23, %zmm8, %zmm17
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm22 {%k3} = zmm23[0],zmm18[0],zmm23[2],zmm18[2],zmm23[4],zmm18[4],zmm23[6],zmm18[6]
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm23, %zmm3
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm18, %zmm1, %zmm3
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm18, %zmm4, %zmm23
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm18, %zmm2, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm24, %zmm0
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm9, %zmm15, %zmm0
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 192(%rdx), %zmm18
-; AVX512DQ-FAST-NEXT:    vmovdqa64 192(%rcx), %zmm0
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm18, %zmm10
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm0, %zmm9, %zmm10
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm16
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm18, %zmm15, %zmm16
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm18, %zmm10
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm0, %zmm1, %zmm10
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vpermi2q %zmm0, %zmm18, %zmm28
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm0, %zmm26, %zmm18
-; AVX512DQ-FAST-NEXT:    vpermi2q %zmm13, %zmm5, %zmm15
-; AVX512DQ-FAST-NEXT:    vpermi2q %zmm5, %zmm13, %zmm9
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm13, %zmm28
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm5, %zmm26, %zmm13
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm5, %zmm1, %zmm28
-; AVX512DQ-FAST-NEXT:    vmovdqa64 192(%rdi), %zmm20
-; AVX512DQ-FAST-NEXT:    vmovdqa64 192(%rsi), %zmm0
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm20, %zmm21
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm0, %zmm4, %zmm21
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm20, %zmm10
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm0, %zmm1, %zmm10
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm27, %zmm26
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm6, %zmm11, %zmm26
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm6, %zmm5
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm27, %zmm8, %zmm5
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm15 {%k3} = zmm27[0],zmm6[0],zmm27[2],zmm6[2],zmm27[4],zmm6[4],zmm27[6],zmm6[6]
-; AVX512DQ-FAST-NEXT:    vpermi2q %zmm6, %zmm27, %zmm1
-; AVX512DQ-FAST-NEXT:    vpermi2q %zmm6, %zmm27, %zmm4
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm6, %zmm2, %zmm27
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm16 {%k3} = zmm20[0],zmm0[0],zmm20[2],zmm0[2],zmm20[4],zmm0[4],zmm20[6],zmm0[6]
-; AVX512DQ-FAST-NEXT:    vpermi2q %zmm0, %zmm20, %zmm11
-; AVX512DQ-FAST-NEXT:    vpermi2q %zmm20, %zmm0, %zmm8
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vpermi2q %zmm0, %zmm20, %zmm6
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm0, %zmm2, %zmm20
-; AVX512DQ-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
-; AVX512DQ-FAST-NEXT:    movb $14, %sil
-; AVX512DQ-FAST-NEXT:    kmovw %esi, %k3
-; AVX512DQ-FAST-NEXT:    vmovdqu64 (%rsp), %zmm2 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k3}
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm2, (%rsp) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm7 {%k1}
-; AVX512DQ-FAST-NEXT:    movb $120, %sil
-; AVX512DQ-FAST-NEXT:    kmovw %esi, %k4
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm7, %zmm0 {%k4}
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm27, %zmm0
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm20, %zmm16, %zmm0
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm24, %zmm2 # 64-byte Folded Reload
-; AVX512DQ-FAST-NEXT:    # zmm2 = zmm24[0,1,2,3],mem[4,5,6,7]
-; AVX512DQ-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm22 {%k3}
-; AVX512DQ-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm15 {%k3}
-; AVX512DQ-FAST-NEXT:    movb $-61, %sil
-; AVX512DQ-FAST-NEXT:    kmovw %esi, %k5
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k5}
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm20, %zmm0
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm27, %zmm15, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm7, %zmm18, %zmm16
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm18, %zmm7, %zmm15
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm18, %zmm17
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm18, %zmm6
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm18, %zmm21
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm7, %zmm31, %zmm18
+; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [6,14,6,14,6,14,6,14]
+; AVX512DQ-FAST-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm7, %zmm0, %zmm17
+; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm13 = [13,5,13,5,13,5,13,5]
+; AVX512DQ-FAST-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm7, %zmm13, %zmm6
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm7, %zmm19, %zmm21
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm22, %zmm3
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm22, %zmm31
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm11, %zmm2
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm11, %zmm25, %zmm31
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm22, %zmm13, %zmm11
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, %zmm22
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm3, %zmm0, %zmm22
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm3, %zmm19, %zmm2
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm25, %zmm3 {%k1}
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, %zmm7 {%k4}
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm28, %zmm1 {%k1}
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm1, %zmm24 {%k4}
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm31 {%k3} = zmm8[0],zmm26[0],zmm8[2],zmm26[2],zmm8[4],zmm26[4],zmm8[6],zmm26[6]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm8, %zmm3
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm26, %zmm13, %zmm3
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm8, %zmm12
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm26, %zmm0, %zmm12
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm26, %zmm19, %zmm8
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm28, %zmm26
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm14, %zmm2
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm14, %zmm25, %zmm26
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm14, %zmm7
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm28, %zmm13, %zmm7
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm28, %zmm0, %zmm14
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm28, %zmm19, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm26 {%k3} = zmm9[0],zmm24[0],zmm9[2],zmm24[2],zmm9[4],zmm24[4],zmm9[6],zmm24[6]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm9, %zmm2
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm24, %zmm13, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm9, %zmm5
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm24, %zmm0, %zmm5
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm24, %zmm19, %zmm9
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm29, %zmm23, %zmm25
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm29, %zmm28
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm23, %zmm13, %zmm28
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm29, %zmm24
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm23, %zmm0, %zmm24
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm23, %zmm19, %zmm29
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm10, %zmm23
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm1, %zmm0, %zmm23
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm10, %zmm8
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm1, %zmm13, %zmm8
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm25 {%k3} = zmm27[0],zmm20[0],zmm27[2],zmm20[2],zmm27[4],zmm20[4],zmm27[6],zmm20[6]
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm20, %zmm27, %zmm13
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm20, %zmm27, %zmm0
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm20, %zmm19, %zmm27
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm10, %zmm9
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm1, %zmm19, %zmm9
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm1, %zmm19, %zmm10
+; AVX512DQ-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm23 = zmm17[0,1,2,3],zmm23[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 192(%r8), %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm17 = <0,11,u,u,4,5,6,7>
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm1, %zmm4, %zmm17
+; AVX512DQ-FAST-NEXT:    vmovdqa64 192(%r9), %zmm4
+; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm20 = <0,1,11,u,4,5,6,7>
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm4, %zmm17, %zmm20
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm8, %zmm6 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm8 = <12,u,u,3,4,5,6,13>
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm1, %zmm6, %zmm8
 ; AVX512DQ-FAST-NEXT:    movb $24, %sil
 ; AVX512DQ-FAST-NEXT:    kmovw %esi, %k3
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm6 {%k3}
-; AVX512DQ-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm23[0,1,2,3],zmm30[4,5,6,7]
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k5}
-; AVX512DQ-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm4[0,1,2,3],zmm9[4,5,6,7]
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k5}
-; AVX512DQ-FAST-NEXT:    movb $-31, %sil
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm9, %zmm21 {%k3}
+; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm6 = <u,1,2,3,4,15,u,u>
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm1, %zmm21, %zmm6
+; AVX512DQ-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7]
+; AVX512DQ-FAST-NEXT:    movb $14, %sil
 ; AVX512DQ-FAST-NEXT:    kmovw %esi, %k4
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm6 {%k4}
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm6, %zmm4
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm28 {%k3}
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm14, %zmm28 {%k4}
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm13, %zmm27 {%k3}
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm12, %zmm27 {%k4}
-; AVX512DQ-FAST-NEXT:    vmovdqa (%rdx), %xmm0
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0]
-; AVX512DQ-FAST-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm9, %zmm0, %zmm31 {%k4}
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm11, %zmm3 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm9, %zmm0, %zmm26 {%k4}
+; AVX512DQ-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm9, %zmm0, %zmm25 {%k4}
+; AVX512DQ-FAST-NEXT:    movb $120, %sil
+; AVX512DQ-FAST-NEXT:    kmovw %esi, %k4
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, %zmm9 {%k4}
+; AVX512DQ-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm21 = zmm12[0,1,2,3],zmm22[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm7, %zmm2 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, %zmm7 {%k4}
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm28, %zmm13 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm13, %zmm11 {%k4}
+; AVX512DQ-FAST-NEXT:    movb $-61, %sil
+; AVX512DQ-FAST-NEXT:    kmovw %esi, %k4
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, %zmm21 {%k4}
+; AVX512DQ-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm13 = zmm5[0,1,2,3],zmm14[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, %zmm13 {%k4}
+; AVX512DQ-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm24[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, %zmm0 {%k4}
+; AVX512DQ-FAST-NEXT:    vmovdqa (%rdx), %xmm2
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0]
+; AVX512DQ-FAST-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX512DQ-FAST-NEXT:    movb $12, %sil
 ; AVX512DQ-FAST-NEXT:    kmovw %esi, %k4
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm29 {%k4}
-; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,0,8,0,8,0,8]
-; AVX512DQ-FAST-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm6 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm0, %zmm12 {%k4}
+; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [0,8,0,8,0,8,0,8]
+; AVX512DQ-FAST-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm3 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm5, %zmm28 {%k3}
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm5, %zmm14 {%k3}
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm29, %zmm27 {%k3}
+; AVX512DQ-FAST-NEXT:    movb $-31, %sil
+; AVX512DQ-FAST-NEXT:    kmovw %esi, %k3
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm5, %zmm28 {%k3}
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm5, %zmm14 {%k3}
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm14, %zmm29
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm30, %zmm27 {%k3}
+; AVX512DQ-FAST-NEXT:    movb $112, %sil
+; AVX512DQ-FAST-NEXT:    kmovw %esi, %k3
+; AVX512DQ-FAST-NEXT:    vinserti64x2 $3, (%rax), %zmm3, %zmm12 {%k3}
 ; AVX512DQ-FAST-NEXT:    vmovdqa 64(%rdx), %xmm3
 ; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0]
 ; AVX512DQ-FAST-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm0, %zmm19 {%k4}
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm0, %zmm14 {%k4}
 ; AVX512DQ-FAST-NEXT:    vmovdqa 128(%rdx), %xmm3
 ; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0]
 ; AVX512DQ-FAST-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm0, %zmm26 {%k4}
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm0, %zmm19 {%k4}
 ; AVX512DQ-FAST-NEXT:    vmovdqa 192(%rdx), %xmm3
 ; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0]
 ; AVX512DQ-FAST-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm0, %zmm11 {%k4}
-; AVX512DQ-FAST-NEXT:    movb $112, %sil
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm0, %zmm16 {%k4}
+; AVX512DQ-FAST-NEXT:    vpbroadcastq 8(%rcx), %ymm3
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm3 = mem[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    movb $6, %sil
 ; AVX512DQ-FAST-NEXT:    kmovw %esi, %k4
-; AVX512DQ-FAST-NEXT:    vinserti64x2 $3, (%rax), %zmm6, %zmm29 {%k4}
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm0, %zmm5 {%k4}
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
-; AVX512DQ-FAST-NEXT:    vinserti64x2 $3, 64(%rax), %zmm3, %zmm19 {%k4}
+; AVX512DQ-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm3 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    vinserti64x2 $3, 64(%rax), %zmm3, %zmm14 {%k3}
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm14, %zmm17
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
-; AVX512DQ-FAST-NEXT:    vinserti64x2 $3, 128(%rax), %zmm3, %zmm26 {%k4}
-; AVX512DQ-FAST-NEXT:    vmovdqa64 192(%r8), %zmm3
-; AVX512DQ-FAST-NEXT:    vmovdqa64 192(%r9), %zmm6
-; AVX512DQ-FAST-NEXT:    vpermi2q %zmm6, %zmm3, %zmm0
-; AVX512DQ-FAST-NEXT:    vinserti64x2 $3, 192(%rax), %zmm0, %zmm11 {%k4}
-; AVX512DQ-FAST-NEXT:    vpbroadcastq 8(%rcx), %ymm0
-; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-FAST-NEXT:    movb $6, %sil
-; AVX512DQ-FAST-NEXT:    kmovw %esi, %k4
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm31 {%k4}
-; AVX512DQ-FAST-NEXT:    vpbroadcastq 72(%rcx), %ymm0
-; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm17 {%k4}
-; AVX512DQ-FAST-NEXT:    vpbroadcastq 136(%rcx), %ymm0
-; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm5 {%k4}
-; AVX512DQ-FAST-NEXT:    vpbroadcastq 200(%rcx), %ymm0
-; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm8 {%k4}
-; AVX512DQ-FAST-NEXT:    movb $56, %cl
+; AVX512DQ-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm3 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    vinserti64x2 $3, 128(%rax), %zmm3, %zmm19 {%k3}
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm19, %zmm22
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm4, %zmm1, %zmm2
+; AVX512DQ-FAST-NEXT:    vinserti64x2 $3, 192(%rax), %zmm2, %zmm16 {%k3}
+; AVX512DQ-FAST-NEXT:    movb $56, %sil
+; AVX512DQ-FAST-NEXT:    kmovw %esi, %k3
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, %zmm5 {%k3}
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm5, %zmm14
+; AVX512DQ-FAST-NEXT:    vpbroadcastq 72(%rcx), %ymm2
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = mem[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm0, %zmm3 {%k4}
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k3}
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, %zmm19
+; AVX512DQ-FAST-NEXT:    vpbroadcastq 136(%rcx), %ymm2
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = mem[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm0, %zmm3 {%k4}
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k3}
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, %zmm24
+; AVX512DQ-FAST-NEXT:    vpbroadcastq 200(%rcx), %ymm2
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = mem[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm0, %zmm15 {%k4}
+; AVX512DQ-FAST-NEXT:    movb $64, %cl
 ; AVX512DQ-FAST-NEXT:    kmovw %ecx, %k4
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm31 {%k4}
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm17 {%k4}
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm5 {%k4}
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vpermi2q %zmm6, %zmm3, %zmm9
-; AVX512DQ-FAST-NEXT:    vmovdqa64 192(%rax), %zmm0
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm0, %zmm12, %zmm9
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm9, %zmm8 {%k4}
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm9, %zmm10 {%k1}
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm9, %zmm14 {%k1}
-; AVX512DQ-FAST-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm21, %zmm9 # 64-byte Folded Reload
-; AVX512DQ-FAST-NEXT:    # zmm9 = zmm21[0,1,2,3],mem[4,5,6,7]
-; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm12 = <0,11,u,u,4,5,6,7>
-; AVX512DQ-FAST-NEXT:    vpermi2q %zmm3, %zmm16, %zmm12
-; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm13 = <0,1,11,u,4,5,6,7>
-; AVX512DQ-FAST-NEXT:    vpermi2q %zmm6, %zmm12, %zmm13
-; AVX512DQ-FAST-NEXT:    vmovdqa 192(%r8), %ymm12
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm12 = ymm12[0],mem[0],ymm12[2],mem[2]
-; AVX512DQ-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm12[2,3,2,3],zmm0[2,3,2,3]
-; AVX512DQ-FAST-NEXT:    movb $64, %al
-; AVX512DQ-FAST-NEXT:    kmovw %eax, %k1
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, %zmm9 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm1, %zmm23 {%k4}
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm4, %zmm2, %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqa64 192(%rax), %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm2, %zmm3, %zmm1
 ; AVX512DQ-FAST-NEXT:    movb $8, %al
-; AVX512DQ-FAST-NEXT:    kmovw %eax, %k1
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm13 {%k1}
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm18, %zmm20 {%k3}
-; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm12 = <12,u,u,3,4,5,6,13>
-; AVX512DQ-FAST-NEXT:    vpermi2q %zmm3, %zmm10, %zmm12
-; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm10 = <u,1,2,3,4,15,u,u>
-; AVX512DQ-FAST-NEXT:    vpermi2q %zmm3, %zmm20, %zmm10
-; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm3 = <0,12,u,3,4,5,6,7>
-; AVX512DQ-FAST-NEXT:    vpermi2q %zmm6, %zmm12, %zmm3
-; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm12 = <13,u,2,3,4,5,6,14>
-; AVX512DQ-FAST-NEXT:    vpermi2q %zmm6, %zmm9, %zmm12
-; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm9 = <u,1,2,3,4,5,15,u>
-; AVX512DQ-FAST-NEXT:    vpermi2q %zmm6, %zmm10, %zmm9
-; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm6 = [0,1,12,3,4,5,6,7]
-; AVX512DQ-FAST-NEXT:    vpermi2q %zmm0, %zmm3, %zmm6
-; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,13,2,3,4,5,6,7]
-; AVX512DQ-FAST-NEXT:    vpermi2q %zmm0, %zmm12, %zmm3
-; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm10 = [14,1,2,3,4,5,6,15]
-; AVX512DQ-FAST-NEXT:    vpermi2q %zmm0, %zmm9, %zmm10
+; AVX512DQ-FAST-NEXT:    kmovw %eax, %k4
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, %zmm20 {%k4}
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm1, %zmm15 {%k3}
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm18, %zmm10 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqa 192(%r8), %ymm1
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[2],mem[2]
+; AVX512DQ-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm10 {%k2} = zmm1[2,3,2,3],zmm2[2,3,2,3]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm1 = <0,12,u,3,4,5,6,7>
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm4, %zmm8, %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm3 = <13,u,2,3,4,5,6,14>
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm4, %zmm23, %zmm3
+; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm5 = <u,1,2,3,4,5,15,u>
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm4, %zmm6, %zmm5
+; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [0,1,12,3,4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm2, %zmm1, %zmm4
+; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,13,2,3,4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm2, %zmm3, %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [14,1,2,3,4,5,6,15]
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm2, %zmm5, %zmm3
 ; AVX512DQ-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm14, 1472(%rax)
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm8, 1408(%rax)
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm11, 1344(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm10, 1472(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm15, 1408(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm16, 1344(%rax)
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm27, 1280(%rax)
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, 1216(%rax)
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm24, 1152(%rax)
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm15, 1088(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, 1216(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm11, 1152(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm25, 1088(%rax)
 ; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 1024(%rax)
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm5, 960(%rax)
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm26, 896(%rax)
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm28, 832(%rax)
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm1, 768(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm24, 960(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm22, 896(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm29, 832(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm13, 768(%rax)
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm7, 704(%rax)
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm22, 640(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm26, 640(%rax)
 ; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 576(%rax)
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm17, 512(%rax)
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm19, 448(%rax)
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm4, 384(%rax)
-; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 320(%rax)
-; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 256(%rax)
-; AVX512DQ-FAST-NEXT:    vmovups (%rsp), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 192(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm19, 512(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm17, 448(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm28, 384(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm21, 320(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm9, 256(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm31, 192(%rax)
 ; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 128(%rax)
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm31, 64(%rax)
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm29, (%rax)
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm10, 1728(%rax)
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, 1664(%rax)
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm6, 1600(%rax)
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm13, 1536(%rax)
-; AVX512DQ-FAST-NEXT:    addq $2056, %rsp # imm = 0x808
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm14, 64(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm12, (%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, 1728(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm1, 1664(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm4, 1600(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm20, 1536(%rax)
+; AVX512DQ-FAST-NEXT:    addq $2088, %rsp # imm = 0x828
 ; AVX512DQ-FAST-NEXT:    vzeroupper
 ; AVX512DQ-FAST-NEXT:    retq
 ;
 ; AVX512BW-ONLY-SLOW-LABEL: store_i64_stride7_vf32:
 ; AVX512BW-ONLY-SLOW:       # %bb.0:
-; AVX512BW-ONLY-SLOW-NEXT:    subq $2120, %rsp # imm = 0x848
+; AVX512BW-ONLY-SLOW-NEXT:    subq $2184, %rsp # imm = 0x888
 ; AVX512BW-ONLY-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm8
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 64(%rdi), %zmm11
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%rsi), %zmm27
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 64(%rsi), %zmm13
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 64(%rdx), %zmm18
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%rdx), %zmm21
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%rcx), %zmm7
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 64(%rcx), %zmm25
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm17
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 64(%rdi), %zmm14
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%rsi), %zmm16
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 64(%rsi), %zmm23
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 64(%rdx), %zmm6
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%rdx), %zmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%rcx), %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 64(%rcx), %zmm29
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%r8), %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%r9), %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%r9), %zmm3
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%rax), %zmm1
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 64(%rax), %zmm5
 ; AVX512BW-ONLY-SLOW-NEXT:    movb $96, %r10b
 ; AVX512BW-ONLY-SLOW-NEXT:    kmovd %r10d, %k1
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm19 = [15,7,15,7,15,7,15,7]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [9,1,9,1,9,1,9,1]
 ; AVX512BW-ONLY-SLOW-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm6
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm2, %zmm6
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm14
-; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [4,9,0,3,4,9,0,3]
-; AVX512BW-ONLY-SLOW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm2, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm11
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm4 = [4,9,0,3,4,9,0,3]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3]
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm3, %zmm6
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm15
-; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [11,3,11,3,11,3,11,3]
-; AVX512BW-ONLY-SLOW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm1
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm27, %zmm3, %zmm1
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm17
-; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm6 = [2,10,0,3,2,10,0,3]
-; AVX512BW-ONLY-SLOW-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm3
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm6, %zmm3
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm16
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm3 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm4, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm12
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm15 = [11,3,11,3,11,3,11,3]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm16, %zmm15, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [2,10,0,3,2,10,0,3]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm8, %zmm10, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm22
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa (%r9), %ymm10
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu %ymm10, (%rsp) # 32-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 64(%r9), %ymm6
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 64(%r9), %ymm13
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa (%r8), %ymm1
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 64(%r8), %ymm12
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 64(%r8), %ymm18
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %ymm18, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm10[0],ymm1[2],ymm10[2]
 ; AVX512BW-ONLY-SLOW-NEXT:    movb $28, %r10b
 ; AVX512BW-ONLY-SLOW-NEXT:    kmovd %r10d, %k2
-; AVX512BW-ONLY-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm1[2,3,2,3],zmm2[2,3,2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [4,12,0,5,4,12,0,5]
-; AVX512BW-ONLY-SLOW-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm3
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm1, %zmm3
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm10
-; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,12,7,0,1,12,7]
-; AVX512BW-ONLY-SLOW-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm2, %zmm1, %zmm3
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm22 = [5,0,14,6,5,0,14,6]
-; AVX512BW-ONLY-SLOW-NEXT:    # zmm22 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm19
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm20
+; AVX512BW-ONLY-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm1[2,3,2,3],zmm2[2,3,2,3]
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm22, %zmm19
-; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [0,13,6,7,0,13,6,7]
-; AVX512BW-ONLY-SLOW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm2, %zmm3, %zmm19
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm19
-; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm4 = [15,7,15,7,15,7,15,7]
-; AVX512BW-ONLY-SLOW-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm20, %zmm4, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [6,13,14,7,6,13,14,7]
-; AVX512BW-ONLY-SLOW-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm2, %zmm19
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm13, %zmm17, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm19
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm25, %zmm16, %zmm19
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm19 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm12[0],ymm6[0],ymm12[2],ymm6[2]
-; AVX512BW-ONLY-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm19 {%k2} = zmm0[2,3,2,3],zmm5[2,3,2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 64(%r8), %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 64(%r9), %zmm12
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm6
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm12, %zmm14, %zmm6
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm15, %zmm6
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm6
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm12, %zmm10, %zmm6
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm1, %zmm6
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm6
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm22, %zmm6
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm3, %zmm6
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm12, %zmm4, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm2, %zmm5
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 128(%rdi), %zmm6
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 128(%rsi), %zmm19
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm19, %zmm17, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 128(%rdx), %zmm5
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 128(%rcx), %zmm28
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm12
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm28, %zmm16, %zmm12
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm17
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm12 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 128(%rax), %zmm29
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 128(%r9), %ymm16
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %ymm16, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 128(%r8), %ymm0
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm16[0],ymm0[2],ymm16[2]
-; AVX512BW-ONLY-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm12 {%k2} = zmm0[2,3,2,3],zmm29[2,3,2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 128(%r8), %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 128(%r9), %zmm20
-; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm20, %zmm0, %zmm10
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm29, %zmm1, %zmm10
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm0, %zmm20, %zmm22
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm29, %zmm3, %zmm22
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [4,12,0,5,4,12,0,5]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm20, %zmm14, %zmm1
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm29, %zmm15, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm10, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm4 = [0,1,12,7,0,1,12,7]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm2, %zmm4, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm20 = [5,0,14,6,5,0,14,6]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm20 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm21
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm20, %zmm21
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [0,13,6,7,0,13,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm2, %zmm21
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm19, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [6,13,14,7,6,13,14,7]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm3, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm14, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm23, %zmm15, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm22, %zmm21
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm29, %zmm22, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm18[0],ymm13[0],ymm18[2],ymm13[2]
+; AVX512BW-ONLY-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k2} = zmm0[2,3,2,3],zmm5[2,3,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 64(%r8), %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 64(%r9), %zmm13
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm13, %zmm11, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm12, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm13, %zmm10, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm4, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm20, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm2, %zmm1
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm20, %zmm4, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm2, %zmm29
-; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [13,5,13,5,13,5,13,5]
-; AVX512BW-ONLY-SLOW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm3, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm24 = [6,14,6,14,6,14,6,14]
-; AVX512BW-ONLY-SLOW-NEXT:    # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm24, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm4, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [3,0,12,4,3,0,12,4]
-; AVX512BW-ONLY-SLOW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm21, %zmm0, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm13, %zmm19, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm3, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 128(%rdi), %zmm18
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 128(%rsi), %zmm22
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm1
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm25, %zmm3, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm22, %zmm15, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 128(%rdx), %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 128(%rcx), %zmm28
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm28, %zmm21, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm5 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 128(%rax), %zmm30
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 128(%r9), %ymm13
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 128(%r8), %ymm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm13[0],ymm1[2],ymm13[2]
+; AVX512BW-ONLY-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm1[2,3,2,3],zmm30[2,3,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 128(%r8), %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 128(%r9), %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm1, %zmm5, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm30, %zmm4, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm5, %zmm1, %zmm20
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm30, %zmm2, %zmm20
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm11, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm30, %zmm12, %zmm1
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm1
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm25, %zmm24, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm19, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm3, %zmm30
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [13,5,13,5,13,5,13,5]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm8, %zmm3, %zmm1
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm1
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm25, %zmm4, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm12 = [6,14,6,14,6,14,6,14]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm8, %zmm12, %zmm1
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm18, %zmm0, %zmm25
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 192(%rdx), %zmm18
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 192(%rcx), %zmm1
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm24, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm26
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm28, %zmm3, %zmm26
-; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm28, %zmm5, %zmm24
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm30
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm28, %zmm4, %zmm30
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm0, %zmm28
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm21
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm18, %zmm0, %zmm21
-; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm1, %zmm18, %zmm17
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm4, %zmm18
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm3, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm8, %zmm19, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm19, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [3,0,12,4,3,0,12,4]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm2, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm31
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm29, %zmm3, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm29, %zmm12, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm29, %zmm19, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm6, %zmm2, %zmm29
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm28, %zmm3, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm28, %zmm12, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm28, %zmm19, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm2, %zmm28
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 192(%rdx), %zmm10
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 192(%rcx), %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm24
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm10, %zmm2, %zmm24
 ; AVX512BW-ONLY-SLOW-NEXT:    movb $48, %r10b
 ; AVX512BW-ONLY-SLOW-NEXT:    kmovd %r10d, %k3
-; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm4 = [1,0,10,2,1,0,10,2]
-; AVX512BW-ONLY-SLOW-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm27, %zmm15
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm8, %zmm4, %zmm15
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm7 {%k3} = zmm8[0],zmm27[0],zmm8[2],zmm27[2],zmm8[4],zmm27[4],zmm8[6],zmm27[6]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm7
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm14
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm16
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm17
-; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm5 = [0,8,0,1,0,8,0,1]
-; AVX512BW-ONLY-SLOW-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm27, %zmm5, %zmm17
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm27, %zmm3, %zmm7
-; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14]
-; AVX512BW-ONLY-SLOW-NEXT:    # ymm1 = mem[0,1,0,1]
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm27, %zmm1, %zmm14
-; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm10 = [15,7,15,7]
-; AVX512BW-ONLY-SLOW-NEXT:    # ymm10 = mem[0,1,0,1]
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm27, %zmm10, %zmm16
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm27
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm11, %zmm4, %zmm27
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm25 {%k3} = zmm11[0],zmm13[0],zmm11[2],zmm13[2],zmm11[4],zmm13[4],zmm11[6],zmm13[6]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm12
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm31
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm13, %zmm5, %zmm11
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm13, %zmm3, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm13, %zmm1, %zmm12
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm13, %zmm10, %zmm31
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 192(%rdi), %zmm22
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 192(%rsi), %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm22, %zmm23
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm1, %zmm23
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm22, %zmm13
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm3, %zmm13
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm19, %zmm8
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm6, %zmm4, %zmm8
-; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm19, %zmm6, %zmm3
-; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm19, %zmm6, %zmm1
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm28 {%k3} = zmm6[0],zmm19[0],zmm6[2],zmm19[2],zmm6[4],zmm19[4],zmm6[6],zmm19[6]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm9
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm19, %zmm5, %zmm6
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm19, %zmm10, %zmm9
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm21 {%k3} = zmm22[0],zmm0[0],zmm22[2],zmm0[2],zmm22[4],zmm0[4],zmm22[6],zmm0[6]
-; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm0, %zmm22, %zmm5
-; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm22, %zmm0, %zmm4
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm0, %zmm22, %zmm19
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm10, %zmm22
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa (%rdx), %xmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512BW-ONLY-SLOW-NEXT:    movb $12, %sil
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 192(%rdi), %zmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 192(%rsi), %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm24 {%k3} = zmm7[0],zmm2[0],zmm7[2],zmm2[2],zmm7[4],zmm2[4],zmm7[6],zmm2[6]
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm11 = [1,0,10,2,1,0,10,2]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm26
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm17, %zmm11, %zmm26
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm13 = [0,8,0,1,0,8,0,1]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm27
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm16, %zmm13, %zmm27
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm21
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm14, %zmm11, %zmm21
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm14, %zmm23
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm13, %zmm23
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm22, %zmm19
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm18, %zmm11, %zmm19
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm20
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm22, %zmm13, %zmm20
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm2, %zmm7, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm7, %zmm2, %zmm11
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm25
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm2, %zmm15, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm2, %zmm12, %zmm25
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm2, %zmm3, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm2, %zmm1, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm31 {%k3} = zmm17[0],zmm16[0],zmm17[2],zmm16[2],zmm17[4],zmm16[4],zmm17[6],zmm16[6]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm16
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm15
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm2, %zmm3, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm2, %zmm12, %zmm16
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm2, %zmm1, %zmm15
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm29 {%k3} = zmm14[0],zmm8[0],zmm14[2],zmm8[2],zmm14[4],zmm8[4],zmm14[6],zmm8[6]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm14, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm14, %zmm17
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm8, %zmm3, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm8, %zmm12, %zmm17
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm8, %zmm1, %zmm14
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm3, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm22, %zmm18, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm9
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm12, %zmm9
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm22, %zmm18, %zmm12
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm28 {%k3} = zmm18[0],zmm22[0],zmm18[2],zmm22[2],zmm18[4],zmm22[4],zmm18[6],zmm22[6]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm22, %zmm1, %zmm18
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm22
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm1, %zmm22
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm1, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm31 = zmm25[0,1,2,3],zmm9[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 192(%r8), %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm9 = <0,11,u,u,4,5,6,7>
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm0, %zmm24, %zmm9
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm5 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm25 = <12,u,u,3,4,5,6,13>
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm0, %zmm5, %zmm25
+; AVX512BW-ONLY-SLOW-NEXT:    movb $24, %sil
 ; AVX512BW-ONLY-SLOW-NEXT:    kmovd %esi, %k3
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm17 {%k3}
-; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,0,8,0,8,0,8]
-; AVX512BW-ONLY-SLOW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm10 # 64-byte Folded Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti32x4 $3, (%rax), %zmm10, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm22, %zmm6 {%k3}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm5 = <u,1,2,3,4,15,u,u>
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm0, %zmm6, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa (%rdx), %xmm6
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
+; AVX512BW-ONLY-SLOW-NEXT:    movb $12, %sil
+; AVX512BW-ONLY-SLOW-NEXT:    kmovd %esi, %k4
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm6, %zmm0, %zmm27 {%k4}
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm6 = [0,8,0,8,0,8,0,8]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm22 # 64-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 64(%rdx), %xmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm8 = xmm8[0],mem[0]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm8, %zmm0, %zmm23 {%k4}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 128(%rdx), %xmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm8 = xmm8[0],mem[0]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm8, %zmm0, %zmm20 {%k4}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 192(%rdx), %xmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm8 = xmm8[0],mem[0]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm8, %zmm0, %zmm13 {%k4}
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti32x4 $3, (%rax), %zmm22, %zmm8
 ; AVX512BW-ONLY-SLOW-NEXT:    movb $112, %sil
 ; AVX512BW-ONLY-SLOW-NEXT:    kmovd %esi, %k4
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm17 {%k4}
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 64(%rdx), %xmm10
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm10, %zmm0, %zmm11 {%k3}
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm10 # 64-byte Folded Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti32x4 $3, 64(%rax), %zmm10, %zmm10
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm11 {%k4}
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 128(%rdx), %xmm10
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm10, %zmm0, %zmm6 {%k3}
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm20, %zmm0, %zmm10
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti32x4 $3, 128(%rax), %zmm10, %zmm10
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm6 {%k4}
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 192(%rdx), %xmm10
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm10, %zmm0, %zmm5 {%k3}
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 192(%r8), %zmm19
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 192(%r9), %zmm20
-; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm20, %zmm19, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti32x4 $3, 192(%rax), %zmm0, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm5 {%k4}
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm7 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm27 {%k4}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm1 # 64-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti32x4 $3, 64(%rax), %zmm1, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm23 {%k4}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm1 # 64-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti32x4 $3, 128(%rax), %zmm1, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm20 {%k4}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 192(%r9), %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm1, %zmm0, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti32x4 $3, 192(%rax), %zmm6, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm13 {%k4}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm4 {%k1}
 ; AVX512BW-ONLY-SLOW-NEXT:    movb $120, %sil
-; AVX512BW-ONLY-SLOW-NEXT:    kmovd %esi, %k3
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm0 {%k3}
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm7 # 64-byte Folded Reload
-; AVX512BW-ONLY-SLOW-NEXT:    # zmm7 = zmm14[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    kmovd %esi, %k4
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm24 {%k4}
+; AVX512BW-ONLY-SLOW-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm22 # 64-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm22 = zmm16[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm2 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm16 {%k4}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k4}
 ; AVX512BW-ONLY-SLOW-NEXT:    movb $-61, %sil
 ; AVX512BW-ONLY-SLOW-NEXT:    kmovd %esi, %k4
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm7 {%k4}
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm0 {%k3}
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm26, %zmm3 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm26 {%k3}
-; AVX512BW-ONLY-SLOW-NEXT:    movb $24, %sil
-; AVX512BW-ONLY-SLOW-NEXT:    kmovd %esi, %k3
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm16 {%k3}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm22 {%k4}
+; AVX512BW-ONLY-SLOW-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm2 # 64-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm2 = zmm17[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm2 {%k4}
 ; AVX512BW-ONLY-SLOW-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm3 # 64-byte Folded Reload
 ; AVX512BW-ONLY-SLOW-NEXT:    # zmm3 = zmm12[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm3 {%k4}
-; AVX512BW-ONLY-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm24[4,5,6,7]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k4}
-; AVX512BW-ONLY-SLOW-NEXT:    movb $-31, %sil
-; AVX512BW-ONLY-SLOW-NEXT:    kmovd %esi, %k4
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm16 {%k4}
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm31 {%k3}
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm31 {%k4}
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm30, %zmm9 {%k3}
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm29, %zmm9 {%k4}
-; AVX512BW-ONLY-SLOW-NEXT:    vpbroadcastq 8(%rcx), %ymm0
-; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm3 {%k4}
+; AVX512BW-ONLY-SLOW-NEXT:    vpbroadcastq 8(%rcx), %ymm6
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm6 = mem[0,1,2,3],ymm6[4,5,6,7]
 ; AVX512BW-ONLY-SLOW-NEXT:    movb $6, %sil
 ; AVX512BW-ONLY-SLOW-NEXT:    kmovd %esi, %k4
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm15 {%k4}
-; AVX512BW-ONLY-SLOW-NEXT:    vpbroadcastq 72(%rcx), %ymm0
-; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm27 {%k4}
-; AVX512BW-ONLY-SLOW-NEXT:    vpbroadcastq 136(%rcx), %ymm0
-; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm8 {%k4}
-; AVX512BW-ONLY-SLOW-NEXT:    vpbroadcastq 200(%rcx), %ymm0
-; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm4 {%k4}
-; AVX512BW-ONLY-SLOW-NEXT:    movb $56, %cl
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm6, %zmm0, %zmm26 {%k4}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm15 {%k3}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm14 {%k3}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm18 {%k3}
+; AVX512BW-ONLY-SLOW-NEXT:    movb $-31, %sil
+; AVX512BW-ONLY-SLOW-NEXT:    kmovd %esi, %k3
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm15 {%k3}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm14 {%k3}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm30, %zmm18 {%k3}
+; AVX512BW-ONLY-SLOW-NEXT:    movb $56, %sil
+; AVX512BW-ONLY-SLOW-NEXT:    kmovd %esi, %k3
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm26 {%k3}
+; AVX512BW-ONLY-SLOW-NEXT:    vpbroadcastq 72(%rcx), %ymm6
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm6 = mem[0,1,2,3],ymm6[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm6, %zmm0, %zmm21 {%k4}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm21 {%k3}
+; AVX512BW-ONLY-SLOW-NEXT:    vpbroadcastq 136(%rcx), %ymm6
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm6 = mem[0,1,2,3],ymm6[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm6, %zmm0, %zmm19 {%k4}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm19 {%k3}
+; AVX512BW-ONLY-SLOW-NEXT:    vpbroadcastq 200(%rcx), %ymm6
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm6 = mem[0,1,2,3],ymm6[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm6, %zmm0, %zmm11 {%k4}
+; AVX512BW-ONLY-SLOW-NEXT:    movb $64, %cl
 ; AVX512BW-ONLY-SLOW-NEXT:    kmovd %ecx, %k4
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm15 {%k4}
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm27 {%k4}
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm8 {%k4}
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm20, %zmm19, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 192(%rax), %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm10, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm4 {%k4}
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm13 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm14 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm23, %zmm10 # 64-byte Folded Reload
-; AVX512BW-ONLY-SLOW-NEXT:    # zmm10 = zmm23[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm12 = <0,11,u,u,4,5,6,7>
-; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm19, %zmm21, %zmm12
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm2 = <0,1,11,u,4,5,6,7>
-; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm20, %zmm12, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 192(%r8), %ymm12
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm12 = ymm12[0],mem[0],ymm12[2],mem[2]
-; AVX512BW-ONLY-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm12[2,3,2,3],zmm0[2,3,2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm14, %zmm21
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm22 {%k3}
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq (%rsp), %ymm12, %ymm12 # 32-byte Folded Reload
-; AVX512BW-ONLY-SLOW-NEXT:    # ymm12 = ymm12[1],mem[1],ymm12[3],mem[3]
-; AVX512BW-ONLY-SLOW-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[0,2,3,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],mem[6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm31 {%k4}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm6, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 192(%rax), %zmm6
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm6, %zmm8, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm11 {%k3}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm0 = <0,1,11,u,4,5,6,7>
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm1, %zmm9, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm10 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 192(%r8), %ymm7
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm7 = ymm7[0],mem[0],ymm7[2],mem[2]
+; AVX512BW-ONLY-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm10 {%k2} = zmm7[2,3,2,3],zmm6[2,3,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq (%rsp), %ymm7, %ymm7 # 32-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT:    # ymm7 = ymm7[1],mem[1],ymm7[3],mem[3]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[0,2,3,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],mem[6,7]
 ; AVX512BW-ONLY-SLOW-NEXT:    movb $14, %cl
 ; AVX512BW-ONLY-SLOW-NEXT:    kmovd %ecx, %k1
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm12, %zmm0, %zmm18 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload
-; AVX512BW-ONLY-SLOW-NEXT:    # ymm12 = ymm12[1],mem[1],ymm12[3],mem[3]
-; AVX512BW-ONLY-SLOW-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[0,2,3,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],mem[6,7]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm12, %zmm0, %zmm25 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload
-; AVX512BW-ONLY-SLOW-NEXT:    # ymm12 = ymm12[1],mem[1],ymm12[3],mem[3]
-; AVX512BW-ONLY-SLOW-NEXT:    movb $64, %cl
-; AVX512BW-ONLY-SLOW-NEXT:    kmovd %ecx, %k2
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm19, %zmm10 {%k2}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm7, %zmm0, %zmm12 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT:    # ymm7 = ymm7[1],mem[1],ymm7[3],mem[3]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[0,2,3,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],mem[6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm7, %zmm0, %zmm29 {%k1}
 ; AVX512BW-ONLY-SLOW-NEXT:    movb $8, %cl
 ; AVX512BW-ONLY-SLOW-NEXT:    kmovd %ecx, %k2
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k2}
-; AVX512BW-ONLY-SLOW-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[0,2,3,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],mem[6,7]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm12, %zmm0, %zmm28 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm12 = <12,u,u,3,4,5,6,13>
-; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm19, %zmm13, %zmm12
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm13 = <u,1,2,3,4,15,u,u>
-; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm19, %zmm22, %zmm13
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm14 = <0,12,u,3,4,5,6,7>
-; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm20, %zmm12, %zmm14
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm12 = <13,u,2,3,4,5,6,14>
-; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm20, %zmm10, %zmm12
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm10 = <u,1,2,3,4,5,15,u>
-; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm20, %zmm13, %zmm10
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm13 = [0,1,12,3,4,5,6,7]
-; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm0, %zmm14, %zmm13
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm14 = [0,13,2,3,4,5,6,7]
-; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm0, %zmm12, %zmm14
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm12 = [14,1,2,3,4,5,6,15]
-; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm0, %zmm10, %zmm12
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm0 {%k2}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT:    # ymm7 = ymm7[1],mem[1],ymm7[3],mem[3]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[0,2,3,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],mem[6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm7, %zmm0, %zmm28 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm7 = <0,12,u,3,4,5,6,7>
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm1, %zmm25, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm8 = <13,u,2,3,4,5,6,14>
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm1, %zmm31, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm9 = <u,1,2,3,4,5,15,u>
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm1, %zmm5, %zmm9
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,1,12,3,4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm6, %zmm7, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm5 = [0,13,2,3,4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm6, %zmm8, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm7 = [14,1,2,3,4,5,6,15]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm6, %zmm9, %zmm7
 ; AVX512BW-ONLY-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm21, 1472(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, 1408(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, 1280(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, 1216(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm26, 1152(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 1024(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, 960(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm31, 832(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, 768(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 704(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 576(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm27, 512(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm16, 384(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, 320(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 256(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 128(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm15, 64(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, 1344(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm10, 1472(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm11, 1408(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm18, 1280(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, 1216(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, 1152(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm3, 1024(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm19, 960(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm14, 832(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, 768(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm16, 704(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm2, 576(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm21, 512(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm15, 384(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm22, 320(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm24, 256(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm2, 128(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm26, 64(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, 1344(%rax)
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm28, 1088(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, 896(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm25, 640(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm11, 448(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm18, 192(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm17, (%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm12, 1728(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm14, 1664(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, 1600(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, 1536(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    addq $2120, %rsp # imm = 0x848
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm20, 896(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm29, 640(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm23, 448(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm12, 192(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm27, (%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, 1728(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, 1664(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, 1600(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, 1536(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    addq $2184, %rsp # imm = 0x888
 ; AVX512BW-ONLY-SLOW-NEXT:    vzeroupper
 ; AVX512BW-ONLY-SLOW-NEXT:    retq
 ;
 ; AVX512BW-ONLY-FAST-LABEL: store_i64_stride7_vf32:
 ; AVX512BW-ONLY-FAST:       # %bb.0:
-; AVX512BW-ONLY-FAST-NEXT:    subq $2024, %rsp # imm = 0x7E8
+; AVX512BW-ONLY-FAST-NEXT:    subq $2152, %rsp # imm = 0x868
 ; AVX512BW-ONLY-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%rdi), %zmm7
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%rdi), %zmm30
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%rsi), %zmm17
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%rsi), %zmm15
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%rdx), %zmm13
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%rdx), %zmm8
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%rcx), %zmm27
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%rcx), %zmm20
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%rdi), %zmm16
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%rdi), %zmm17
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%rsi), %zmm29
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%rsi), %zmm28
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%rdx), %zmm19
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%rdx), %zmm23
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%rcx), %zmm22
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%rcx), %zmm30
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%r8), %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%r9), %zmm9
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%rax), %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%r9), %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%rax), %zmm3
 ; AVX512BW-ONLY-FAST-NEXT:    movb $96, %r10b
 ; AVX512BW-ONLY-FAST-NEXT:    kmovd %r10d, %k1
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm21 = [15,7,15,7,15,7,15,7]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [9,1,9,1,9,1,9,1]
 ; AVX512BW-ONLY-FAST-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm3
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm9, %zmm2, %zmm3
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm10
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm2, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm8
 ; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [4,9,0,3,4,9,0,3]
 ; AVX512BW-ONLY-FAST-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm2, %zmm3
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm11
-; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [11,3,11,3,11,3,11,3]
-; AVX512BW-ONLY-FAST-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, %zmm1
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm17, %zmm2, %zmm1
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm14
-; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [2,10,0,3,2,10,0,3]
-; AVX512BW-ONLY-FAST-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm27, %zmm3, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm16
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm2, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm9
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm25 = [11,3,11,3,11,3,11,3]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm16, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm29, %zmm25, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm4 = [2,10,0,3,2,10,0,3]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm23, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm22, %zmm4, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm15
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm5
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa (%r9), %ymm1
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 64(%r9), %ymm6
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 128(%r9), %ymm2
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%r8), %ymm19
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%r8), %ymm23
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 128(%r8), %ymm22
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm19[0],ymm1[0],ymm19[2],ymm1[2]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 128(%r9), %ymm4
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa (%r8), %ymm10
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 64(%r8), %ymm12
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 128(%r8), %ymm11
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm2 = ymm10[0],ymm1[0],ymm10[2],ymm1[2]
 ; AVX512BW-ONLY-FAST-NEXT:    movb $28, %r10b
 ; AVX512BW-ONLY-FAST-NEXT:    kmovd %r10d, %k2
-; AVX512BW-ONLY-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm3[2,3,2,3],zmm4[2,3,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm2[2,3,2,3],zmm3[2,3,2,3]
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm21
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} ymm3 = [1,3,7,7]
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %ymm1, %ymm3, %ymm19
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %ymm19, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm4 = [4,12,0,5,4,12,0,5]
-; AVX512BW-ONLY-FAST-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm9, %zmm4, %zmm1
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm19
-; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm5 = [0,1,12,7,0,1,12,7]
-; AVX512BW-ONLY-FAST-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm21, %zmm5, %zmm1
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm25 = [5,0,14,6,5,0,14,6]
-; AVX512BW-ONLY-FAST-NEXT:    # zmm25 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, %zmm1
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm25, %zmm1
-; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm4 = [0,13,6,7,0,13,6,7]
-; AVX512BW-ONLY-FAST-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm21, %zmm4, %zmm1
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm28 = [15,7,15,7,15,7,15,7]
-; AVX512BW-ONLY-FAST-NEXT:    # zmm28 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm9, %zmm28, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [6,13,14,7,6,13,14,7]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} ymm5 = [1,3,7,7]
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %ymm1, %ymm5, %ymm10
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu %ymm10, (%rsp) # 32-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [4,12,0,5,4,12,0,5]
 ; AVX512BW-ONLY-FAST-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm1, %zmm21
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm23[0],ymm6[0],ymm23[2],ymm6[2]
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %ymm6, %ymm3, %ymm23
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %ymm23, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm6 = ymm22[0],ymm2[0],ymm22[2],ymm2[2]
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %ymm2, %ymm3, %ymm22
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %ymm22, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm30, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm15, %zmm14, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm3
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm20, %zmm16, %zmm3
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%rax), %zmm18
-; AVX512BW-ONLY-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm0[2,3,2,3],zmm18[2,3,2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%r8), %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%r9), %zmm22
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm22, %zmm10, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm11, %zmm9
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm18, %zmm11, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm22, %zmm19, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm18, %zmm5, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm22, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm25, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm18, %zmm4, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm1, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm10
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,12,7,0,1,12,7]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm1, %zmm2
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm22, %zmm28, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm1, %zmm18
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 128(%rdi), %zmm26
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 128(%rsi), %zmm3
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm26, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm14, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 128(%rdx), %zmm24
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 128(%rcx), %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm24, %zmm14
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm16, %zmm14
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm16, %zmm23
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm14 {%k1}
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 128(%rax), %zmm11
-; AVX512BW-ONLY-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm6[2,3,2,3],zmm11[2,3,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm13 = [5,0,14,6,5,0,14,6]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, %zmm14
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, %zmm18
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm13, %zmm14
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [0,13,6,7,0,13,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm2, %zmm14
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 128(%r8), %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 128(%r9), %zmm14
-; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm14, %zmm2, %zmm19
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm11, %zmm5, %zmm19
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm2, %zmm14, %zmm25
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm11, %zmm4, %zmm25
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm4
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm14, %zmm10, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm18, %zmm21, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [6,13,14,7,6,13,14,7]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm3, %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm12[0],ymm6[0],ymm12[2],ymm6[2]
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %ymm6, %ymm5, %ymm12
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm7 = ymm11[0],ymm4[0],ymm11[2],ymm4[2]
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %ymm4, %ymm5, %ymm11
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm17, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm28, %zmm25, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm19, %zmm6
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm15, %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm30, %zmm15, %zmm6
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm6 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%rax), %zmm11
+; AVX512BW-ONLY-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm6 {%k2} = zmm0[2,3,2,3],zmm11[2,3,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%r8), %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%r9), %zmm6
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm6, %zmm8, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm11, %zmm9, %zmm4
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm14, %zmm28, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm2, %zmm1, %zmm11
-; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm5 = [3,0,12,4,3,0,12,4]
-; AVX512BW-ONLY-FAST-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm27, %zmm29
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm8, %zmm5, %zmm29
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm6, %zmm10, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm11, %zmm1, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm6, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm13, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm11, %zmm2, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm6, %zmm21, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm3, %zmm11
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 128(%rdi), %zmm27
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 128(%rsi), %zmm18
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm27, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm18, %zmm25, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 128(%rdx), %zmm31
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 128(%rcx), %zmm15
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm31, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm15, %zmm5, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm4 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 128(%rax), %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm7[2,3,2,3],zmm5[2,3,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 128(%r8), %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 128(%r9), %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm4, %zmm0, %zmm10
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm1, %zmm10
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm0, %zmm4, %zmm13
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm2, %zmm13
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm8, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm9, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm21, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm3, %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 192(%rdx), %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 192(%rcx), %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm13 = [3,0,12,4,3,0,12,4]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm24
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm13, %zmm24
+; AVX512BW-ONLY-FAST-NEXT:    movb $48, %r10b
+; AVX512BW-ONLY-FAST-NEXT:    kmovd %r10d, %k3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 192(%rdi), %zmm12
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 192(%rsi), %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm24 {%k3} = zmm12[0],zmm3[0],zmm12[2],zmm3[2],zmm12[4],zmm3[4],zmm12[6],zmm3[6]
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm14 = [0,8,0,1,0,8,0,1]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm16, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm29, %zmm14, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm17, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm28, %zmm14, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm27, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm18, %zmm14, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm3, %zmm12, %zmm14
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm11 = [1,0,10,2,1,0,10,2]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm29, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm16, %zmm11, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm28, %zmm26
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm17, %zmm11, %zmm26
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm18, %zmm20
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm27, %zmm11, %zmm20
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm12, %zmm3, %zmm11
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm12, %zmm8
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm12, %zmm10
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm12, %zmm9
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm25, %zmm12
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm4 = [6,14,6,14,6,14,6,14]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm4, %zmm8
 ; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [13,5,13,5,13,5,13,5]
 ; AVX512BW-ONLY-FAST-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm27, %zmm1, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm10 = [6,14,6,14,6,14,6,14]
-; AVX512BW-ONLY-FAST-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm27, %zmm10, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm1, %zmm10
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm21, %zmm9
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm22, %zmm25
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm23, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm23, %zmm13, %zmm25
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm22, %zmm1, %zmm23
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm22, %zmm4, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm22, %zmm21, %zmm2
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm27, %zmm28, %zmm8
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    movb $48, %r10b
-; AVX512BW-ONLY-FAST-NEXT:    kmovd %r10d, %k3
-; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm31 = [0,8,0,1,0,8,0,1]
-; AVX512BW-ONLY-FAST-NEXT:    # zmm31 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, %zmm25
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm17, %zmm31, %zmm25
-; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm4 = [1,0,10,2,1,0,10,2]
-; AVX512BW-ONLY-FAST-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm17, %zmm21
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm4, %zmm21
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm29 {%k3} = zmm7[0],zmm17[0],zmm7[2],zmm17[2],zmm7[4],zmm17[4],zmm7[6],zmm17[6]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, %zmm9
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm17, %zmm1, %zmm9
-; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm8 = [6,14,6,14]
-; AVX512BW-ONLY-FAST-NEXT:    # ymm8 = mem[0,1,0,1]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm17, %zmm8, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [15,7,15,7]
-; AVX512BW-ONLY-FAST-NEXT:    # ymm2 = mem[0,1,0,1]
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm17, %zmm2, %zmm7
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm6
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm20, %zmm17
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm13, %zmm5, %zmm17
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm20, %zmm1, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm20, %zmm10, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm20, %zmm28, %zmm13
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm25 {%k3} = zmm16[0],zmm29[0],zmm16[2],zmm29[2],zmm16[4],zmm29[4],zmm16[6],zmm29[6]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm16, %zmm6
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm29, %zmm1, %zmm6
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm16, %zmm22
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm29, %zmm4, %zmm22
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm29, %zmm21, %zmm16
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm30, %zmm16
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm15, %zmm31, %zmm16
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm15, %zmm20
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm30, %zmm4, %zmm20
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm17 {%k3} = zmm30[0],zmm15[0],zmm30[2],zmm15[2],zmm30[4],zmm15[4],zmm30[6],zmm15[6]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm30, %zmm7
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm15, %zmm1, %zmm7
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm30, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm15, %zmm8, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm19, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm19, %zmm13, %zmm16
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm30, %zmm1, %zmm19
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm29
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm30, %zmm4, %zmm29
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm30, %zmm21, %zmm2
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm15, %zmm6, %zmm30
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm6, %zmm13
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 192(%rdx), %zmm15
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 192(%rcx), %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm15, %zmm6
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm2, %zmm10, %zmm6
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm12
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm15, %zmm5, %zmm12
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm15, %zmm6
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm2, %zmm1, %zmm6
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm2, %zmm15, %zmm23
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm2, %zmm28, %zmm15
-; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm24, %zmm0, %zmm5
-; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm0, %zmm24, %zmm10
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm24, %zmm27
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm28, %zmm24
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm1, %zmm27
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 192(%rdi), %zmm19
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 192(%rsi), %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm19, %zmm23
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm8, %zmm23
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm19, %zmm6
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm1, %zmm6
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm26, %zmm28
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm31, %zmm28
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm26, %zmm4, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm5 {%k3} = zmm26[0],zmm3[0],zmm26[2],zmm3[2],zmm26[4],zmm3[4],zmm26[6],zmm3[6]
-; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm3, %zmm26, %zmm1
-; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm3, %zmm26, %zmm8
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm13, %zmm26
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm12 {%k3} = zmm19[0],zmm0[0],zmm19[2],zmm0[2],zmm19[4],zmm0[4],zmm19[6],zmm0[6]
-; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm0, %zmm19, %zmm31
-; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm19, %zmm0, %zmm4
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm0, %zmm19, %zmm3
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm13, %zmm19
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm17, %zmm30
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm16 {%k3} = zmm17[0],zmm28[0],zmm17[2],zmm28[2],zmm17[4],zmm28[4],zmm17[6],zmm28[6]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm17, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm28, %zmm1, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm28, %zmm4, %zmm17
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm28, %zmm21, %zmm30
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm31, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm31, %zmm15, %zmm13
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm15, %zmm1, %zmm31
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm28
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm15, %zmm4, %zmm28
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm15, %zmm21, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, %zmm15
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm4, %zmm15
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm1, %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm13 {%k3} = zmm27[0],zmm18[0],zmm27[2],zmm18[2],zmm27[4],zmm18[4],zmm27[6],zmm18[6]
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm18, %zmm27, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm18, %zmm27, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm18, %zmm21, %zmm27
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, %zmm18
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm21, %zmm18
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm2, %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm8[0,1,2,3],zmm15[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 192(%r8), %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm0 = <0,11,u,u,4,5,6,7>
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm2, %zmm24, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 192(%r9), %zmm8
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm15 = <0,1,11,u,4,5,6,7>
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm8, %zmm0, %zmm15
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, %zmm10 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm0 = <12,u,u,3,4,5,6,13>
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm2, %zmm10, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    movb $24, %sil
+; AVX512BW-ONLY-FAST-NEXT:    kmovd %esi, %k3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm18, %zmm9 {%k3}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm24 = <u,1,2,3,4,15,u,u>
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm2, %zmm9, %zmm24
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa (%rdx), %xmm0
 ; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0]
 ; AVX512BW-ONLY-FAST-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
 ; AVX512BW-ONLY-FAST-NEXT:    movb $12, %sil
-; AVX512BW-ONLY-FAST-NEXT:    kmovd %esi, %k3
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm25 {%k3}
+; AVX512BW-ONLY-FAST-NEXT:    kmovd %esi, %k4
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm21 {%k4}
 ; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,0,8,0,8,0,8]
 ; AVX512BW-ONLY-FAST-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
-; AVX512BW-ONLY-FAST-NEXT:    vinserti32x4 $3, (%rax), %zmm3, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm9 # 64-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 64(%rdx), %xmm5
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm5, %zmm0, %zmm10 {%k4}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 128(%rdx), %xmm5
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm5, %zmm0, %zmm18 {%k4}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 192(%rdx), %xmm5
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm5, %zmm0, %zmm14 {%k4}
+; AVX512BW-ONLY-FAST-NEXT:    vinserti32x4 $3, (%rax), %zmm9, %zmm5
 ; AVX512BW-ONLY-FAST-NEXT:    movb $112, %sil
 ; AVX512BW-ONLY-FAST-NEXT:    kmovd %esi, %k4
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm25 {%k4}
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 64(%rdx), %xmm3
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm0, %zmm16 {%k3}
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm22, %zmm0, %zmm3
-; AVX512BW-ONLY-FAST-NEXT:    vinserti32x4 $3, 64(%rax), %zmm3, %zmm3
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm16 {%k4}
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 128(%rdx), %xmm3
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm0, %zmm28 {%k3}
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm14, %zmm0, %zmm3
-; AVX512BW-ONLY-FAST-NEXT:    vinserti32x4 $3, 128(%rax), %zmm3, %zmm3
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm28 {%k4}
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 192(%rdx), %xmm3
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm0, %zmm31 {%k3}
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 192(%r8), %zmm22
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 192(%r9), %zmm14
-; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm14, %zmm22, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, %zmm21 {%k4}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT:    vinserti32x4 $3, 64(%rax), %zmm5, %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, %zmm10 {%k4}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT:    vinserti32x4 $3, 128(%rax), %zmm5, %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, %zmm18 {%k4}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm18, %zmm21
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm8, %zmm2, %zmm0
 ; AVX512BW-ONLY-FAST-NEXT:    vinserti32x4 $3, 192(%rax), %zmm0, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm31 {%k4}
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm14 {%k4}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu (%rsp), %ymm0 # 32-byte Reload
 ; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
 ; AVX512BW-ONLY-FAST-NEXT:    movb $14, %sil
-; AVX512BW-ONLY-FAST-NEXT:    kmovd %esi, %k3
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm29 {%k3}
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm9 {%k1}
-; AVX512BW-ONLY-FAST-NEXT:    movb $120, %sil
 ; AVX512BW-ONLY-FAST-NEXT:    kmovd %esi, %k4
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, %zmm0 {%k4}
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm13 # 64-byte Folded Reload
-; AVX512BW-ONLY-FAST-NEXT:    # zmm13 = zmm0[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm25 {%k4}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm23, %zmm6 {%k1}
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm17 {%k3}
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm16 {%k4}
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm5 {%k3}
-; AVX512BW-ONLY-FAST-NEXT:    movb $-61, %sil
-; AVX512BW-ONLY-FAST-NEXT:    kmovd %esi, %k5
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm13 {%k5}
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm7 {%k1}
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, %zmm3 {%k4}
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm27, %zmm1 {%k1}
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm27 {%k4}
-; AVX512BW-ONLY-FAST-NEXT:    movb $24, %sil
-; AVX512BW-ONLY-FAST-NEXT:    kmovd %esi, %k3
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm13 {%k4}
+; AVX512BW-ONLY-FAST-NEXT:    movb $120, %sil
+; AVX512BW-ONLY-FAST-NEXT:    kmovd %esi, %k4
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm9 {%k3}
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload
-; AVX512BW-ONLY-FAST-NEXT:    # zmm1 = zmm0[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k5}
-; AVX512BW-ONLY-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm7 = zmm8[0,1,2,3],zmm10[4,5,6,7]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm7 {%k5}
-; AVX512BW-ONLY-FAST-NEXT:    movb $-31, %sil
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm6, %zmm9 {%k4}
+; AVX512BW-ONLY-FAST-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm22, %zmm5 # 64-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT:    # zmm5 = zmm22[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm19, %zmm3 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm10 {%k4}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm31, %zmm1 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm18 {%k4}
+; AVX512BW-ONLY-FAST-NEXT:    movb $-61, %sil
 ; AVX512BW-ONLY-FAST-NEXT:    kmovd %esi, %k4
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm9 {%k4}
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm30 {%k3}
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm18, %zmm30 {%k4}
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm24, %zmm26 {%k3}
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm11, %zmm26 {%k4}
-; AVX512BW-ONLY-FAST-NEXT:    vpbroadcastq 8(%rcx), %ymm0
-; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm5 {%k4}
+; AVX512BW-ONLY-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm17[0,1,2,3],zmm29[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k4}
+; AVX512BW-ONLY-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm4[0,1,2,3],zmm28[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm1 {%k4}
+; AVX512BW-ONLY-FAST-NEXT:    vpbroadcastq 8(%rcx), %ymm3
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm3 = mem[0,1,2,3],ymm3[4,5,6,7]
 ; AVX512BW-ONLY-FAST-NEXT:    movb $6, %sil
 ; AVX512BW-ONLY-FAST-NEXT:    kmovd %esi, %k4
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm21 {%k4}
-; AVX512BW-ONLY-FAST-NEXT:    vpbroadcastq 72(%rcx), %ymm0
-; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm20 {%k4}
-; AVX512BW-ONLY-FAST-NEXT:    vpbroadcastq 136(%rcx), %ymm0
-; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k4}
-; AVX512BW-ONLY-FAST-NEXT:    vpbroadcastq 200(%rcx), %ymm0
-; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm4 {%k4}
-; AVX512BW-ONLY-FAST-NEXT:    movb $56, %cl
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm0, %zmm4 {%k4}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm6 {%k3}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm30 {%k3}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm27 {%k3}
+; AVX512BW-ONLY-FAST-NEXT:    movb $-31, %sil
+; AVX512BW-ONLY-FAST-NEXT:    kmovd %esi, %k3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm6 {%k3}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm6, %zmm19
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm30 {%k3}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm27 {%k3}
+; AVX512BW-ONLY-FAST-NEXT:    movb $56, %sil
+; AVX512BW-ONLY-FAST-NEXT:    kmovd %esi, %k3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k3}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm17
+; AVX512BW-ONLY-FAST-NEXT:    vpbroadcastq 72(%rcx), %ymm3
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm3 = mem[0,1,2,3],ymm3[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm0, %zmm26 {%k4}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm26 {%k3}
+; AVX512BW-ONLY-FAST-NEXT:    vpbroadcastq 136(%rcx), %ymm3
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm3 = mem[0,1,2,3],ymm3[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm0, %zmm20 {%k4}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm20 {%k3}
+; AVX512BW-ONLY-FAST-NEXT:    vpbroadcastq 200(%rcx), %ymm3
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm3 = mem[0,1,2,3],ymm3[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm0, %zmm11 {%k4}
+; AVX512BW-ONLY-FAST-NEXT:    movb $64, %cl
 ; AVX512BW-ONLY-FAST-NEXT:    kmovd %ecx, %k4
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm21 {%k4}
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm20 {%k4}
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k4}
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm14, %zmm22, %zmm8
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 192(%rax), %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm9, %zmm8
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, %zmm4 {%k4}
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, %zmm6 {%k1}
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, %zmm11 {%k1}
-; AVX512BW-ONLY-FAST-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm23, %zmm8 # 64-byte Folded Reload
-; AVX512BW-ONLY-FAST-NEXT:    # zmm8 = zmm23[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm9 = <0,11,u,u,4,5,6,7>
-; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm22, %zmm12, %zmm9
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm10 = <0,1,11,u,4,5,6,7>
-; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm14, %zmm9, %zmm10
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 192(%r8), %ymm9
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm9 = ymm9[0],mem[0],ymm9[2],mem[2]
-; AVX512BW-ONLY-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm11 {%k2} = zmm9[2,3,2,3],zmm0[2,3,2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm11, %zmm12
-; AVX512BW-ONLY-FAST-NEXT:    movb $64, %al
-; AVX512BW-ONLY-FAST-NEXT:    kmovd %eax, %k1
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm22, %zmm8 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm6 {%k4}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm8, %zmm3, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 192(%rax), %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm4, %zmm2
 ; AVX512BW-ONLY-FAST-NEXT:    movb $8, %al
-; AVX512BW-ONLY-FAST-NEXT:    kmovd %eax, %k1
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm10 {%k1}
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm15, %zmm19 {%k3}
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm9 = <12,u,u,3,4,5,6,13>
-; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm22, %zmm6, %zmm9
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm6 = <u,1,2,3,4,15,u,u>
-; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm22, %zmm19, %zmm6
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm11 = <0,12,u,3,4,5,6,7>
-; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm14, %zmm9, %zmm11
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm9 = <13,u,2,3,4,5,6,14>
-; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm14, %zmm8, %zmm9
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm8 = <u,1,2,3,4,5,15,u>
-; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm14, %zmm6, %zmm8
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm6 = [0,1,12,3,4,5,6,7]
-; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm0, %zmm11, %zmm6
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm11 = [0,13,2,3,4,5,6,7]
-; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm0, %zmm9, %zmm11
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm9 = [14,1,2,3,4,5,6,15]
-; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm0, %zmm8, %zmm9
+; AVX512BW-ONLY-FAST-NEXT:    kmovd %eax, %k4
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm15 {%k4}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm11 {%k3}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm12, %zmm7 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 192(%r8), %ymm2
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm2 = ymm2[0],mem[0],ymm2[2],mem[2]
+; AVX512BW-ONLY-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm7 {%k2} = zmm2[2,3,2,3],zmm3[2,3,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm2 = <0,12,u,3,4,5,6,7>
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm8, %zmm4, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm4 = <13,u,2,3,4,5,6,14>
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm8, %zmm6, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm6 = <u,1,2,3,4,5,15,u>
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm8, %zmm24, %zmm6
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm8 = [0,1,12,3,4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm3, %zmm2, %zmm8
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,13,2,3,4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm3, %zmm4, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [14,1,2,3,4,5,6,15]
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm3, %zmm6, %zmm4
 ; AVX512BW-ONLY-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm12, 1472(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, 1408(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm26, 1280(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, 1216(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm27, 1152(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, 1088(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 1024(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, 960(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, 1472(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm11, 1408(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm27, 1280(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, 1216(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm18, 1152(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, 1088(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm1, 1024(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm20, 960(%rax)
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm30, 832(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, 768(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, 704(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm17, 640(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, 768(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm10, 704(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm16, 640(%rax)
 ; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 576(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm20, 512(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm26, 512(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm19, 384(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, 320(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, 256(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm25, 192(%rax)
 ; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 384(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, 320(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 128(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm17, 64(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm14, 1344(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm21, 896(%rax)
 ; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 256(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm29, 192(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 448(%rax)
 ; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 128(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm21, 64(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm31, 1344(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm28, 896(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm16, 448(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm25, (%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, 1728(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm11, 1664(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm6, 1600(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm10, 1536(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    addq $2024, %rsp # imm = 0x7E8
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, (%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, 1728(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, 1664(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, 1600(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm15, 1536(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    addq $2152, %rsp # imm = 0x868
 ; AVX512BW-ONLY-FAST-NEXT:    vzeroupper
 ; AVX512BW-ONLY-FAST-NEXT:    retq
 ;
 ; AVX512DQBW-SLOW-LABEL: store_i64_stride7_vf32:
 ; AVX512DQBW-SLOW:       # %bb.0:
-; AVX512DQBW-SLOW-NEXT:    subq $2120, %rsp # imm = 0x848
+; AVX512DQBW-SLOW-NEXT:    subq $2184, %rsp # imm = 0x888
 ; AVX512DQBW-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm10
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 64(%rdi), %zmm8
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%rsi), %zmm15
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 64(%rsi), %zmm13
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm16
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 64(%rdi), %zmm9
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%rsi), %zmm23
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 64(%rsi), %zmm10
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 64(%rdx), %zmm20
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%rdx), %zmm21
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%rcx), %zmm5
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 64(%rcx), %zmm28
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%r8), %zmm3
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%r9), %zmm2
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%rax), %zmm4
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 64(%rax), %zmm7
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%rdx), %zmm18
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%rcx), %zmm6
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 64(%rcx), %zmm7
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%r8), %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%r9), %zmm3
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%rax), %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 64(%rax), %zmm4
 ; AVX512DQBW-SLOW-NEXT:    movb $96, %r10b
 ; AVX512DQBW-SLOW-NEXT:    kmovd %r10d, %k1
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [9,1,9,1,9,1,9,1]
-; AVX512DQBW-SLOW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm1
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm2, %zmm0, %zmm1
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm14
-; AVX512DQBW-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [4,9,0,3,4,9,0,3]
-; AVX512DQBW-SLOW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm4, %zmm0, %zmm1
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm16
-; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [11,3,11,3,11,3,11,3]
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm17 = [15,7,15,7,15,7,15,7]
+; AVX512DQBW-SLOW-NEXT:    # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [9,1,9,1,9,1,9,1]
 ; AVX512DQBW-SLOW-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm15, %zmm1, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm18
-; AVX512DQBW-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm9 = [2,10,0,3,2,10,0,3]
-; AVX512DQBW-SLOW-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm1
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm5, %zmm9, %zmm1
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm17
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vmovdqa (%r9), %ymm9
-; AVX512DQBW-SLOW-NEXT:    vmovdqu %ymm9, (%rsp) # 32-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa 64(%r9), %ymm11
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm3, %zmm1, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm12
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [4,9,0,3,4,9,0,3]
+; AVX512DQBW-SLOW-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm2, %zmm1, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm13
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm25 = [11,3,11,3,11,3,11,3]
+; AVX512DQBW-SLOW-NEXT:    # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm23, %zmm25, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [2,10,0,3,2,10,0,3]
+; AVX512DQBW-SLOW-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm6, %zmm8, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm15
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm5 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa (%r9), %ymm8
+; AVX512DQBW-SLOW-NEXT:    vmovdqu %ymm8, (%rsp) # 32-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 64(%r9), %ymm14
+; AVX512DQBW-SLOW-NEXT:    vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa (%r8), %ymm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 64(%r8), %ymm11
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa (%r8), %ymm0
-; AVX512DQBW-SLOW-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa 64(%r8), %ymm12
-; AVX512DQBW-SLOW-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm9[0],ymm0[2],ymm9[2]
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm8[0],ymm1[2],ymm8[2]
 ; AVX512DQBW-SLOW-NEXT:    movb $28, %r10b
 ; AVX512DQBW-SLOW-NEXT:    kmovd %r10d, %k2
-; AVX512DQBW-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k2} = zmm0[2,3,2,3],zmm4[2,3,2,3]
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [4,12,0,5,4,12,0,5]
-; AVX512DQBW-SLOW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm1
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm2, %zmm0, %zmm1
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm9
-; AVX512DQBW-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [0,1,12,7,0,1,12,7]
-; AVX512DQBW-SLOW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm4, %zmm0, %zmm1
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm22 = [5,0,14,6,5,0,14,6]
-; AVX512DQBW-SLOW-NEXT:    # zmm22 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm19
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm3, %zmm22, %zmm19
-; AVX512DQBW-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [0,13,6,7,0,13,6,7]
+; AVX512DQBW-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm1[2,3,2,3],zmm2[2,3,2,3]
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [4,12,0,5,4,12,0,5]
+; AVX512DQBW-SLOW-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm3, %zmm8, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,12,7,0,1,12,7]
 ; AVX512DQBW-SLOW-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm4, %zmm1, %zmm19
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm19
-; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm4 = [15,7,15,7,15,7,15,7]
-; AVX512DQBW-SLOW-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm2, %zmm4, %zmm3
-; AVX512DQBW-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [6,13,14,7,6,13,14,7]
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm2, %zmm1, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm21 = [5,0,14,6,5,0,14,6]
+; AVX512DQBW-SLOW-NEXT:    # zmm21 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm19
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm0, %zmm21, %zmm19
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [0,13,6,7,0,13,6,7]
 ; AVX512DQBW-SLOW-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm3, %zmm2, %zmm19
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm3
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm13, %zmm18, %zmm3
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm19
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm28, %zmm17, %zmm19
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm23
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm19 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm12[0],ymm11[0],ymm12[2],ymm11[2]
-; AVX512DQBW-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm19 {%k2} = zmm3[2,3,2,3],zmm7[2,3,2,3]
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm5, %zmm2, %zmm19
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 64(%r8), %zmm3
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 64(%r9), %zmm12
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm11
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm12, %zmm14, %zmm11
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm7, %zmm16, %zmm11
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm11
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm12, %zmm9, %zmm11
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm7, %zmm0, %zmm11
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm11
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm3, %zmm17, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [6,13,14,7,6,13,14,7]
+; AVX512DQBW-SLOW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm0, %zmm3, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm10, %zmm25, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm7, %zmm15, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm5 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm11[0],ymm14[0],ymm11[2],ymm14[2]
+; AVX512DQBW-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm0[2,3,2,3],zmm4[2,3,2,3]
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 64(%r8), %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 64(%r9), %zmm11
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm11, %zmm12, %zmm5
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm3, %zmm22, %zmm11
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm7, %zmm1, %zmm11
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm4, %zmm13, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm11, %zmm8, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm5, %zmm1, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm4
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm12, %zmm4, %zmm3
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm3, %zmm2, %zmm7
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 128(%rdi), %zmm24
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 128(%rsi), %zmm31
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm24, %zmm3
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm31, %zmm18, %zmm3
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 128(%rdx), %zmm17
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 128(%rcx), %zmm30
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm7
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm30, %zmm23, %zmm7
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm7 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 128(%rax), %zmm29
-; AVX512DQBW-SLOW-NEXT:    vmovdqa 128(%r9), %ymm11
-; AVX512DQBW-SLOW-NEXT:    vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa 128(%r8), %ymm3
-; AVX512DQBW-SLOW-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm3[0],ymm11[0],ymm3[2],ymm11[2]
-; AVX512DQBW-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm7 {%k2} = zmm3[2,3,2,3],zmm29[2,3,2,3]
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 128(%r8), %zmm3
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 128(%r9), %zmm7
-; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm7, %zmm3, %zmm9
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm29, %zmm0, %zmm9
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm3, %zmm7, %zmm22
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm29, %zmm1, %zmm22
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm7, %zmm14, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm29, %zmm16, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm7, %zmm4, %zmm3
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm3, %zmm2, %zmm29
-; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [13,5,13,5,13,5,13,5]
-; AVX512DQBW-SLOW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm5, %zmm3, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm0, %zmm21, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm5, %zmm2, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm25 = [6,14,6,14,6,14,6,14]
-; AVX512DQBW-SLOW-NEXT:    # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm5, %zmm25, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm11, %zmm17, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm0, %zmm3, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 128(%rdi), %zmm11
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 128(%rsi), %zmm19
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm19, %zmm25, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 128(%rdx), %zmm4
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 128(%rcx), %zmm31
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm31, %zmm15, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm0 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 128(%rax), %zmm15
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 128(%r9), %ymm14
+; AVX512DQBW-SLOW-NEXT:    vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 128(%r8), %ymm5
+; AVX512DQBW-SLOW-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm5 = ymm5[0],ymm14[0],ymm5[2],ymm14[2]
+; AVX512DQBW-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k2} = zmm5[2,3,2,3],zmm15[2,3,2,3]
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm5, %zmm4, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 128(%r8), %zmm5
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 128(%r9), %zmm0
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm0, %zmm5, %zmm8
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm15, %zmm1, %zmm8
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm5, %zmm0, %zmm21
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm15, %zmm2, %zmm21
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm0, %zmm12, %zmm1
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [3,0,12,4,3,0,12,4]
-; AVX512DQBW-SLOW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm21, %zmm0, %zmm5
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm26
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm28, %zmm3, %zmm26
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm1
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm28, %zmm25, %zmm1
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm1
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm28, %zmm4, %zmm1
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm20, %zmm0, %zmm28
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 192(%rdx), %zmm12
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 192(%rcx), %zmm2
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm1
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm2, %zmm25, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm15, %zmm13, %zmm1
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm11
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm30, %zmm3, %zmm11
-; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm30, %zmm17, %zmm25
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm22
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm30, %zmm4, %zmm22
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm17, %zmm0, %zmm30
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm16
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm12, %zmm0, %zmm16
-; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm2, %zmm12, %zmm23
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm2, %zmm4, %zmm12
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm2, %zmm3, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm0, %zmm17, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm5, %zmm3, %zmm15
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [13,5,13,5,13,5,13,5]
+; AVX512DQBW-SLOW-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm6, %zmm1, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm6 = [6,14,6,14,6,14,6,14]
+; AVX512DQBW-SLOW-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm0, %zmm6, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm0, %zmm17, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm15
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [3,0,12,4,3,0,12,4]
+; AVX512DQBW-SLOW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm18, %zmm3, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm7, %zmm1, %zmm0
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm7, %zmm6, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm7, %zmm17, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm20, %zmm3, %zmm7
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm29
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm31, %zmm1, %zmm29
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm30
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm31, %zmm6, %zmm30
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm31, %zmm17, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm4, %zmm3, %zmm31
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 192(%rdx), %zmm12
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 192(%rcx), %zmm21
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm22
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm12, %zmm3, %zmm22
 ; AVX512DQBW-SLOW-NEXT:    movb $48, %r10b
 ; AVX512DQBW-SLOW-NEXT:    kmovd %r10d, %k3
-; AVX512DQBW-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm4 = [1,0,10,2,1,0,10,2]
-; AVX512DQBW-SLOW-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm15, %zmm20
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm10, %zmm4, %zmm20
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm5 {%k3} = zmm10[0],zmm15[0],zmm10[2],zmm15[2],zmm10[4],zmm15[4],zmm10[6],zmm15[6]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 192(%rdi), %zmm8
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 192(%rsi), %zmm7
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm22 {%k3} = zmm8[0],zmm7[0],zmm8[2],zmm7[2],zmm8[4],zmm7[4],zmm8[6],zmm7[6]
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm14 = [1,0,10,2,1,0,10,2]
+; AVX512DQBW-SLOW-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm28
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm16, %zmm14, %zmm28
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm13 = [0,8,0,1,0,8,0,1]
+; AVX512DQBW-SLOW-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm27
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm23, %zmm13, %zmm27
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm26
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm9, %zmm14, %zmm26
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm23
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm10, %zmm13, %zmm23
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm19, %zmm20
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm11, %zmm14, %zmm20
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm18
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm19, %zmm13, %zmm18
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm7, %zmm8, %zmm13
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm8, %zmm7, %zmm14
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm24
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm7, %zmm25, %zmm8
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm7, %zmm6, %zmm24
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm7, %zmm1, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm7, %zmm17, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm2 {%k3} = zmm16[0],zmm0[0],zmm16[2],zmm0[2],zmm16[4],zmm0[4],zmm16[6],zmm0[6]
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm25
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm17
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm0, %zmm1, %zmm25
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm0, %zmm6, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm0, %zmm15, %zmm17
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm5 {%k3} = zmm9[0],zmm10[0],zmm9[2],zmm10[2],zmm9[4],zmm10[4],zmm9[6],zmm10[6]
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm18
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm7
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm21
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm14
-; AVX512DQBW-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [0,8,0,1,0,8,0,1]
-; AVX512DQBW-SLOW-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm15, %zmm2, %zmm14
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm15, %zmm3, %zmm18
-; AVX512DQBW-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14]
-; AVX512DQBW-SLOW-NEXT:    # ymm1 = mem[0,1,0,1]
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm15, %zmm1, %zmm7
-; AVX512DQBW-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm6 = [15,7,15,7]
-; AVX512DQBW-SLOW-NEXT:    # ymm6 = mem[0,1,0,1]
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm15, %zmm6, %zmm21
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm15
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm8, %zmm4, %zmm15
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm28 {%k3} = zmm8[0],zmm13[0],zmm8[2],zmm13[2],zmm8[4],zmm13[4],zmm8[6],zmm13[6]
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm23
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm5
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm17
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm10
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm13, %zmm2, %zmm10
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm13, %zmm3, %zmm23
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm13, %zmm1, %zmm5
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm13, %zmm6, %zmm17
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 192(%rdi), %zmm19
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 192(%rsi), %zmm0
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm19, %zmm27
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm0, %zmm1, %zmm27
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm19, %zmm13
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm0, %zmm3, %zmm13
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm8
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm24, %zmm4, %zmm8
-; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm31, %zmm24, %zmm3
-; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm31, %zmm24, %zmm1
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm30 {%k3} = zmm24[0],zmm31[0],zmm24[2],zmm31[2],zmm24[4],zmm31[4],zmm24[6],zmm31[6]
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm24, %zmm9
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm31, %zmm2, %zmm24
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm31, %zmm6, %zmm9
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm16 {%k3} = zmm19[0],zmm0[0],zmm19[2],zmm0[2],zmm19[4],zmm0[4],zmm19[6],zmm0[6]
-; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm0, %zmm19, %zmm2
-; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm19, %zmm0, %zmm4
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm0, %zmm19, %zmm31
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm0, %zmm6, %zmm19
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm18 {%k1}
-; AVX512DQBW-SLOW-NEXT:    movb $120, %sil
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm16
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm10, %zmm1, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm10, %zmm6, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm10, %zmm15, %zmm16
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm9
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm21, %zmm1, %zmm9
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm19, %zmm11, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm10
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm21, %zmm6, %zmm10
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm19, %zmm11, %zmm6
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm31 {%k3} = zmm11[0],zmm19[0],zmm11[2],zmm19[2],zmm11[4],zmm19[4],zmm11[6],zmm19[6]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm7
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm19, %zmm15, %zmm7
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm11
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm21, %zmm15, %zmm11
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm21, %zmm15, %zmm12
+; AVX512DQBW-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm24 = zmm24[0,1,2,3],zmm10[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 192(%r8), %zmm19
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm10 = <0,11,u,u,4,5,6,7>
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm19, %zmm22, %zmm10
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm3 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm22 = <12,u,u,3,4,5,6,13>
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm19, %zmm3, %zmm22
+; AVX512DQBW-SLOW-NEXT:    movb $24, %sil
 ; AVX512DQBW-SLOW-NEXT:    kmovd %esi, %k3
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm0 {%k3}
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm18 # 64-byte Folded Reload
-; AVX512DQBW-SLOW-NEXT:    # zmm18 = zmm7[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm4 {%k3}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm21 = <u,1,2,3,4,15,u,u>
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm19, %zmm4, %zmm21
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm25 {%k1}
+; AVX512DQBW-SLOW-NEXT:    movb $120, %sil
+; AVX512DQBW-SLOW-NEXT:    kmovd %esi, %k4
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm25, %zmm11 {%k4}
+; AVX512DQBW-SLOW-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm3 # 64-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT:    # zmm3 = zmm2[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm0 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm15 {%k4}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm29, %zmm1 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm25 {%k4}
 ; AVX512DQBW-SLOW-NEXT:    movb $-61, %sil
 ; AVX512DQBW-SLOW-NEXT:    kmovd %esi, %k4
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm18 {%k4}
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm26, %zmm23 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm26 {%k3}
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm3 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm3 {%k4}
+; AVX512DQBW-SLOW-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm4 # 64-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT:    # zmm4 = zmm5[0,1,2,3],mem[4,5,6,7]
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm0 {%k3}
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    movb $24, %sil
-; AVX512DQBW-SLOW-NEXT:    kmovd %esi, %k3
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm4 {%k4}
+; AVX512DQBW-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm6[0,1,2,3],zmm30[4,5,6,7]
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm21 {%k3}
-; AVX512DQBW-SLOW-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm3 # 64-byte Folded Reload
-; AVX512DQBW-SLOW-NEXT:    # zmm3 = zmm5[0,1,2,3],mem[4,5,6,7]
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm3 {%k4}
-; AVX512DQBW-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm25[4,5,6,7]
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k4}
-; AVX512DQBW-SLOW-NEXT:    movb $-31, %sil
-; AVX512DQBW-SLOW-NEXT:    kmovd %esi, %k4
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm21 {%k4}
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm17 {%k3}
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm17 {%k4}
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm22, %zmm9 {%k3}
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm29, %zmm9 {%k4}
-; AVX512DQBW-SLOW-NEXT:    vmovdqa (%rdx), %xmm1
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0]
-; AVX512DQBW-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k4}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa (%rdx), %xmm0
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0]
+; AVX512DQBW-SLOW-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
 ; AVX512DQBW-SLOW-NEXT:    movb $12, %sil
 ; AVX512DQBW-SLOW-NEXT:    kmovd %esi, %k4
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm14 {%k4}
-; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [0,8,0,8,0,8,0,8]
-; AVX512DQBW-SLOW-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm6 # 64-byte Folded Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqa 64(%rdx), %xmm5
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0]
-; AVX512DQBW-SLOW-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm5, %zmm0, %zmm10 {%k4}
-; AVX512DQBW-SLOW-NEXT:    vmovdqa 128(%rdx), %xmm5
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0]
-; AVX512DQBW-SLOW-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm5, %zmm0, %zmm24 {%k4}
-; AVX512DQBW-SLOW-NEXT:    vmovdqa 192(%rdx), %xmm5
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0]
-; AVX512DQBW-SLOW-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm5, %zmm0, %zmm2 {%k4}
-; AVX512DQBW-SLOW-NEXT:    movb $112, %sil
-; AVX512DQBW-SLOW-NEXT:    kmovd %esi, %k4
-; AVX512DQBW-SLOW-NEXT:    vinserti64x2 $3, (%rax), %zmm6, %zmm14 {%k4}
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm27 {%k4}
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,0,8,0,8,0,8]
+; AVX512DQBW-SLOW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm5 # 64-byte Folded Reload
-; AVX512DQBW-SLOW-NEXT:    vinserti64x2 $3, 64(%rax), %zmm5, %zmm10 {%k4}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm17 {%k3}
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm5 # 64-byte Folded Reload
-; AVX512DQBW-SLOW-NEXT:    vinserti64x2 $3, 128(%rax), %zmm5, %zmm24 {%k4}
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 192(%r8), %zmm5
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 192(%r9), %zmm11
-; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm11, %zmm5, %zmm1
-; AVX512DQBW-SLOW-NEXT:    vinserti64x2 $3, 192(%rax), %zmm1, %zmm2 {%k4}
-; AVX512DQBW-SLOW-NEXT:    vpbroadcastq 8(%rcx), %ymm1
-; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm16 {%k3}
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm7 {%k3}
+; AVX512DQBW-SLOW-NEXT:    movb $-31, %sil
+; AVX512DQBW-SLOW-NEXT:    kmovd %esi, %k3
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm17 {%k3}
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm16 {%k3}
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm7 {%k3}
+; AVX512DQBW-SLOW-NEXT:    movb $112, %sil
+; AVX512DQBW-SLOW-NEXT:    kmovd %esi, %k3
+; AVX512DQBW-SLOW-NEXT:    vinserti64x2 $3, (%rax), %zmm2, %zmm27 {%k3}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 64(%rdx), %xmm2
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0]
+; AVX512DQBW-SLOW-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm0, %zmm23 {%k4}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 128(%rdx), %xmm2
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0]
+; AVX512DQBW-SLOW-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm0, %zmm18 {%k4}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 192(%rdx), %xmm2
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0]
+; AVX512DQBW-SLOW-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm0, %zmm13 {%k4}
+; AVX512DQBW-SLOW-NEXT:    vpbroadcastq 8(%rcx), %ymm2
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = mem[0,1,2,3],ymm2[4,5,6,7]
 ; AVX512DQBW-SLOW-NEXT:    movb $6, %sil
 ; AVX512DQBW-SLOW-NEXT:    kmovd %esi, %k4
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm20 {%k4}
-; AVX512DQBW-SLOW-NEXT:    vpbroadcastq 72(%rcx), %ymm1
-; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm15 {%k4}
-; AVX512DQBW-SLOW-NEXT:    vpbroadcastq 136(%rcx), %ymm1
-; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm8 {%k4}
-; AVX512DQBW-SLOW-NEXT:    vpbroadcastq 200(%rcx), %ymm1
-; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm4 {%k4}
-; AVX512DQBW-SLOW-NEXT:    movb $56, %cl
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm0, %zmm28 {%k4}
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT:    vinserti64x2 $3, 64(%rax), %zmm2, %zmm23 {%k3}
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT:    vinserti64x2 $3, 128(%rax), %zmm2, %zmm18 {%k3}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 192(%r9), %zmm2
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm2, %zmm19, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vinserti64x2 $3, 192(%rax), %zmm0, %zmm13 {%k3}
+; AVX512DQBW-SLOW-NEXT:    movb $56, %sil
+; AVX512DQBW-SLOW-NEXT:    kmovd %esi, %k3
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm28 {%k3}
+; AVX512DQBW-SLOW-NEXT:    vpbroadcastq 72(%rcx), %ymm0
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm26 {%k4}
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm26 {%k3}
+; AVX512DQBW-SLOW-NEXT:    vpbroadcastq 136(%rcx), %ymm0
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm20 {%k4}
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm20 {%k3}
+; AVX512DQBW-SLOW-NEXT:    vpbroadcastq 200(%rcx), %ymm0
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm14 {%k4}
+; AVX512DQBW-SLOW-NEXT:    movb $64, %cl
 ; AVX512DQBW-SLOW-NEXT:    kmovd %ecx, %k4
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm20 {%k4}
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm15 {%k4}
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm8 {%k4}
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm11, %zmm5, %zmm6
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 192(%rax), %zmm1
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm1, %zmm7, %zmm6
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm4 {%k4}
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm13 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm23 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm7 # 64-byte Folded Reload
-; AVX512DQBW-SLOW-NEXT:    # zmm7 = zmm27[0,1,2,3],mem[4,5,6,7]
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm22 = <0,11,u,u,4,5,6,7>
-; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm5, %zmm16, %zmm22
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm6 = <0,1,11,u,4,5,6,7>
-; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm11, %zmm22, %zmm6
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 192(%r8), %ymm16
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm16 = ymm16[0],mem[0],ymm16[2],mem[2]
-; AVX512DQBW-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm23 {%k2} = zmm16[2,3,2,3],zmm1[2,3,2,3]
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm19 {%k3}
-; AVX512DQBW-SLOW-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq (%rsp), %ymm12, %ymm12 # 32-byte Folded Reload
-; AVX512DQBW-SLOW-NEXT:    # ymm12 = ymm12[1],mem[1],ymm12[3],mem[3]
-; AVX512DQBW-SLOW-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[0,2,3,3]
-; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],mem[6,7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm19, %zmm24 {%k4}
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm2, %zmm0, %zmm19
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 192(%rax), %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm0, %zmm5, %zmm19
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm19, %zmm14 {%k3}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm5 = <0,1,11,u,4,5,6,7>
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm2, %zmm10, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm12 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 192(%r8), %ymm6
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm6 = ymm6[0],mem[0],ymm6[2],mem[2]
+; AVX512DQBW-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm12 {%k2} = zmm6[2,3,2,3],zmm0[2,3,2,3]
+; AVX512DQBW-SLOW-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq (%rsp), %ymm6, %ymm6 # 32-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT:    # ymm6 = ymm6[1],mem[1],ymm6[3],mem[3]
+; AVX512DQBW-SLOW-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[0,2,3,3]
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],mem[6,7]
 ; AVX512DQBW-SLOW-NEXT:    movb $14, %cl
 ; AVX512DQBW-SLOW-NEXT:    kmovd %ecx, %k1
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm12, %zmm0, %zmm16 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload
-; AVX512DQBW-SLOW-NEXT:    # ymm12 = ymm12[1],mem[1],ymm12[3],mem[3]
-; AVX512DQBW-SLOW-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[0,2,3,3]
-; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],mem[6,7]
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm12, %zmm0, %zmm28 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload
-; AVX512DQBW-SLOW-NEXT:    # ymm12 = ymm12[1],mem[1],ymm12[3],mem[3]
-; AVX512DQBW-SLOW-NEXT:    movb $64, %cl
-; AVX512DQBW-SLOW-NEXT:    kmovd %ecx, %k2
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm7 {%k2}
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm6, %zmm0, %zmm10 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT:    # ymm6 = ymm6[1],mem[1],ymm6[3],mem[3]
+; AVX512DQBW-SLOW-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[0,2,3,3]
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],mem[6,7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm6, %zmm0, %zmm19 {%k1}
 ; AVX512DQBW-SLOW-NEXT:    movb $8, %cl
 ; AVX512DQBW-SLOW-NEXT:    kmovd %ecx, %k2
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm6 {%k2}
-; AVX512DQBW-SLOW-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[0,2,3,3]
-; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],mem[6,7]
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm12, %zmm0, %zmm30 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm12 = <12,u,u,3,4,5,6,13>
-; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm5, %zmm13, %zmm12
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm13 = <u,1,2,3,4,15,u,u>
-; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm5, %zmm19, %zmm13
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm5 = <0,12,u,3,4,5,6,7>
-; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm11, %zmm12, %zmm5
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm12 = <13,u,2,3,4,5,6,14>
-; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm11, %zmm7, %zmm12
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm7 = <u,1,2,3,4,5,15,u>
-; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm11, %zmm13, %zmm7
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm11 = [0,1,12,3,4,5,6,7]
-; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm1, %zmm5, %zmm11
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm5 = [0,13,2,3,4,5,6,7]
-; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm1, %zmm12, %zmm5
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm12 = [14,1,2,3,4,5,6,15]
-; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm1, %zmm7, %zmm12
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm5 {%k2}
+; AVX512DQBW-SLOW-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT:    # ymm6 = ymm6[1],mem[1],ymm6[3],mem[3]
+; AVX512DQBW-SLOW-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[0,2,3,3]
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],mem[6,7]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm6, %zmm0, %zmm31 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm6 = <0,12,u,3,4,5,6,7>
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm2, %zmm22, %zmm6
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm8 = <13,u,2,3,4,5,6,14>
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm2, %zmm24, %zmm8
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm9 = <u,1,2,3,4,5,15,u>
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm2, %zmm21, %zmm9
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,12,3,4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm0, %zmm6, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm6 = [0,13,2,3,4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm0, %zmm8, %zmm6
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm8 = [14,1,2,3,4,5,6,15]
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm0, %zmm9, %zmm8
 ; AVX512DQBW-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm23, 1472(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm4, 1408(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm2, 1344(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm9, 1280(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, 1216(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 1152(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm30, 1088(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm12, 1472(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm14, 1408(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm13, 1344(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm7, 1280(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, 1216(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm25, 1152(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm31, 1088(%rax)
 ; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 1024(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm8, 960(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm24, 896(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm17, 832(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm3, 768(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm26, 704(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm28, 640(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm20, 960(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm18, 896(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm16, 832(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm4, 768(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm15, 704(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm19, 640(%rax)
 ; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 576(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm15, 512(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm10, 448(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm21, 384(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm18, 320(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 256(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm16, 192(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm26, 512(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm23, 448(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm17, 384(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm3, 320(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm11, 256(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm10, 192(%rax)
 ; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 128(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm20, 64(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm14, (%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm12, 1728(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm5, 1664(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm11, 1600(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm6, 1536(%rax)
-; AVX512DQBW-SLOW-NEXT:    addq $2120, %rsp # imm = 0x848
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm28, 64(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm27, (%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm8, 1728(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm6, 1664(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm2, 1600(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm5, 1536(%rax)
+; AVX512DQBW-SLOW-NEXT:    addq $2184, %rsp # imm = 0x888
 ; AVX512DQBW-SLOW-NEXT:    vzeroupper
 ; AVX512DQBW-SLOW-NEXT:    retq
 ;
 ; AVX512DQBW-FAST-LABEL: store_i64_stride7_vf32:
 ; AVX512DQBW-FAST:       # %bb.0:
-; AVX512DQBW-FAST-NEXT:    subq $2056, %rsp # imm = 0x808
+; AVX512DQBW-FAST-NEXT:    subq $2088, %rsp # imm = 0x828
 ; AVX512DQBW-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%rdi), %zmm10
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%rdi), %zmm15
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%rsi), %zmm16
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%rsi), %zmm18
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%rdx), %zmm21
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%rdx), %zmm9
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%rcx), %zmm20
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%rcx), %zmm17
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%r8), %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%rdi), %zmm8
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%rdi), %zmm9
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%rsi), %zmm26
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%rsi), %zmm24
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%rdx), %zmm14
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%rdx), %zmm11
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%rcx), %zmm22
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%rcx), %zmm28
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%r8), %zmm3
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%r9), %zmm6
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%rax), %zmm8
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%rax), %zmm5
 ; AVX512DQBW-FAST-NEXT:    movb $96, %r10b
 ; AVX512DQBW-FAST-NEXT:    kmovd %r10d, %k1
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [9,1,9,1,9,1,9,1]
-; AVX512DQBW-FAST-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm2
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm6, %zmm1, %zmm2
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, %zmm11
-; AVX512DQBW-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [4,9,0,3,4,9,0,3]
-; AVX512DQBW-FAST-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm8, %zmm1, %zmm2
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, %zmm12
-; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [11,3,11,3,11,3,11,3]
-; AVX512DQBW-FAST-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm10, %zmm1
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm16, %zmm2, %zmm1
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm14
-; AVX512DQBW-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [2,10,0,3,2,10,0,3]
-; AVX512DQBW-FAST-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm9, %zmm2
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm20, %zmm3, %zmm2
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, %zmm19
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm4
+; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm19 = [15,7,15,7,15,7,15,7]
+; AVX512DQBW-FAST-NEXT:    # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [9,1,9,1,9,1,9,1]
+; AVX512DQBW-FAST-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, %zmm1
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm6, %zmm0, %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm12
+; AVX512DQBW-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [4,9,0,3,4,9,0,3]
+; AVX512DQBW-FAST-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm5, %zmm0, %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm13
+; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm31 = [11,3,11,3,11,3,11,3]
+; AVX512DQBW-FAST-NEXT:    # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm8, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm26, %zmm31, %zmm0
+; AVX512DQBW-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [2,10,0,3,2,10,0,3]
+; AVX512DQBW-FAST-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm11, %zmm1
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm22, %zmm2, %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm18
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, %zmm10
 ; AVX512DQBW-FAST-NEXT:    vmovdqa (%r9), %ymm1
-; AVX512DQBW-FAST-NEXT:    vmovdqa 64(%r9), %ymm5
-; AVX512DQBW-FAST-NEXT:    vmovdqa 128(%r9), %ymm3
+; AVX512DQBW-FAST-NEXT:    vmovdqa 64(%r9), %ymm0
+; AVX512DQBW-FAST-NEXT:    vmovdqa 128(%r9), %ymm4
 ; AVX512DQBW-FAST-NEXT:    vmovdqa (%r8), %ymm7
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%r8), %ymm23
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 128(%r8), %ymm22
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%r8), %ymm16
+; AVX512DQBW-FAST-NEXT:    vmovdqa 128(%r8), %ymm15
 ; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm2 = ymm7[0],ymm1[0],ymm7[2],ymm1[2]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %ymm7, %ymm17
 ; AVX512DQBW-FAST-NEXT:    movb $28, %r10b
 ; AVX512DQBW-FAST-NEXT:    kmovd %r10d, %k2
-; AVX512DQBW-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm2[2,3,2,3],zmm8[2,3,2,3]
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa {{.*#+}} ymm4 = [1,3,7,7]
-; AVX512DQBW-FAST-NEXT:    vpermt2q %ymm1, %ymm4, %ymm7
-; AVX512DQBW-FAST-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQBW-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [4,12,0,5,4,12,0,5]
-; AVX512DQBW-FAST-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm6, %zmm2, %zmm1
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm13
-; AVX512DQBW-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [0,1,12,7,0,1,12,7]
-; AVX512DQBW-FAST-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm8, %zmm7, %zmm1
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm24 = [5,0,14,6,5,0,14,6]
-; AVX512DQBW-FAST-NEXT:    # zmm24 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm5, %zmm7
+; AVX512DQBW-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm10 {%k2} = zmm2[2,3,2,3],zmm5[2,3,2,3]
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa {{.*#+}} ymm5 = [1,3,7,7]
+; AVX512DQBW-FAST-NEXT:    vpermt2q %ymm1, %ymm5, %ymm17
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %ymm17, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQBW-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [4,12,0,5,4,12,0,5]
+; AVX512DQBW-FAST-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, %zmm2
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm6, %zmm1
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm0, %zmm24, %zmm1
-; AVX512DQBW-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [0,13,6,7,0,13,6,7]
-; AVX512DQBW-FAST-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm8, %zmm2, %zmm1
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm6, %zmm10, %zmm2
+; AVX512DQBW-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm6 = [0,1,12,7,0,1,12,7]
+; AVX512DQBW-FAST-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm7, %zmm6, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm17 = [5,0,14,6,5,0,14,6]
+; AVX512DQBW-FAST-NEXT:    # zmm17 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, %zmm20
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, %zmm2
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm26 = [15,7,15,7,15,7,15,7]
-; AVX512DQBW-FAST-NEXT:    # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm6, %zmm26, %zmm0
-; AVX512DQBW-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [6,13,14,7,6,13,14,7]
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm3, %zmm17, %zmm20
+; AVX512DQBW-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [0,13,6,7,0,13,6,7]
 ; AVX512DQBW-FAST-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm0, %zmm1, %zmm8
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm23[0],ymm5[0],ymm23[2],ymm5[2]
-; AVX512DQBW-FAST-NEXT:    vpermt2q %ymm5, %ymm4, %ymm23
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %ymm23, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm8 = ymm22[0],ymm3[0],ymm22[2],ymm3[2]
-; AVX512DQBW-FAST-NEXT:    vpermt2q %ymm3, %ymm4, %ymm22
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %ymm22, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm15, %zmm23
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm15, %zmm3
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm7, %zmm1, %zmm20
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm2, %zmm19, %zmm3
+; AVX512DQBW-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [6,13,14,7,6,13,14,7]
+; AVX512DQBW-FAST-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm3, %zmm2, %zmm7
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm16[0],ymm0[0],ymm16[2],ymm0[2]
+; AVX512DQBW-FAST-NEXT:    vpermt2q %ymm0, %ymm5, %ymm16
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %ymm16, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm15[0],ymm4[0],ymm15[2],ymm4[2]
+; AVX512DQBW-FAST-NEXT:    vpermt2q %ymm4, %ymm5, %ymm15
+; AVX512DQBW-FAST-NEXT:    vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm9, %zmm4
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm24, %zmm31, %zmm4
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm14, %zmm5
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm18, %zmm14, %zmm3
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm21, %zmm4
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm17, %zmm19, %zmm4
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%rax), %zmm14
-; AVX512DQBW-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm0[2,3,2,3],zmm14[2,3,2,3]
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%r8), %zmm0
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%r9), %zmm22
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm4
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm22, %zmm11, %zmm4
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm12, %zmm3
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm28, %zmm18, %zmm5
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm4, %zmm5 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%rax), %zmm7
+; AVX512DQBW-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm3[2,3,2,3],zmm7[2,3,2,3]
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%r8), %zmm3
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%r9), %zmm5
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, %zmm4
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm14, %zmm12, %zmm4
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm5, %zmm12, %zmm4
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm7, %zmm13, %zmm4
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm6
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm13, %zmm4
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm22, %zmm13, %zmm6
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm14, %zmm7, %zmm6
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm22, %zmm6
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm0, %zmm24, %zmm6
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm14, %zmm2, %zmm6
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm22, %zmm26, %zmm0
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm0, %zmm1, %zmm14
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, %zmm4
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm5, %zmm10, %zmm4
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm7, %zmm6, %zmm4
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm5, %zmm4
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm3, %zmm17, %zmm4
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm7, %zmm1, %zmm4
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm5, %zmm19, %zmm3
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm3, %zmm2, %zmm7
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 128(%rdi), %zmm27
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 128(%rsi), %zmm6
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm27, %zmm0
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm6, %zmm5, %zmm0
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 128(%rdx), %zmm13
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 128(%rcx), %zmm5
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm13, %zmm22
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm5, %zmm19, %zmm22
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm19, %zmm28
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm22 {%k1}
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 128(%rax), %zmm12
-; AVX512DQBW-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm22 {%k2} = zmm8[2,3,2,3],zmm12[2,3,2,3]
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 128(%r8), %zmm0
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 128(%r9), %zmm8
-; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm8, %zmm0, %zmm4
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm12, %zmm7, %zmm4
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 128(%rsi), %zmm20
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm27, %zmm3
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm20, %zmm31, %zmm3
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 128(%rdx), %zmm29
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 128(%rcx), %zmm23
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm29, %zmm4
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm23, %zmm18, %zmm4
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 128(%rax), %zmm30
+; AVX512DQBW-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm0[2,3,2,3],zmm30[2,3,2,3]
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm0, %zmm8, %zmm24
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm12, %zmm2, %zmm24
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm2
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm8, %zmm11, %zmm2
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm12, %zmm3, %zmm2
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 128(%r8), %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 128(%r9), %zmm3
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm3, %zmm0, %zmm10
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm30, %zmm6, %zmm10
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm0, %zmm3, %zmm17
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm30, %zmm1, %zmm17
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm3, %zmm12, %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm30, %zmm13, %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm3, %zmm19, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm0, %zmm2, %zmm30
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 192(%rdx), %zmm10
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 192(%rcx), %zmm1
+; AVX512DQBW-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm25 = [3,0,12,4,3,0,12,4]
+; AVX512DQBW-FAST-NEXT:    # zmm25 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, %zmm4
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm10, %zmm25, %zmm4
+; AVX512DQBW-FAST-NEXT:    movb $48, %r10b
+; AVX512DQBW-FAST-NEXT:    kmovd %r10d, %k3
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 192(%rdi), %zmm18
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 192(%rsi), %zmm7
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm4 {%k3} = zmm18[0],zmm7[0],zmm18[2],zmm7[2],zmm18[4],zmm7[4],zmm18[6],zmm7[6]
+; AVX512DQBW-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm16 = [0,8,0,1,0,8,0,1]
+; AVX512DQBW-FAST-NEXT:    # zmm16 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm8, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm26, %zmm16, %zmm0
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm8, %zmm26, %zmm0
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm0, %zmm1, %zmm12
-; AVX512DQBW-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm15 = [3,0,12,4,3,0,12,4]
+; AVX512DQBW-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm15 = [1,0,10,2,1,0,10,2]
 ; AVX512DQBW-FAST-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm20, %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm26, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm8, %zmm15, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm9, %zmm0
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm9, %zmm15, %zmm1
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, %zmm3
-; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [13,5,13,5,13,5,13,5]
-; AVX512DQBW-FAST-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm9, %zmm2
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm20, %zmm1, %zmm2
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm9 = [6,14,6,14,6,14,6,14]
-; AVX512DQBW-FAST-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm2
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm20, %zmm9, %zmm2
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm20, %zmm26, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm24, %zmm16, %zmm0
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    movb $48, %r10b
-; AVX512DQBW-FAST-NEXT:    kmovd %r10d, %k3
-; AVX512DQBW-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm11 = [0,8,0,1,0,8,0,1]
-; AVX512DQBW-FAST-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm10, %zmm29
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm16, %zmm11, %zmm29
-; AVX512DQBW-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [1,0,10,2,1,0,10,2]
-; AVX512DQBW-FAST-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm16, %zmm31
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm10, %zmm8, %zmm31
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm3 {%k3} = zmm10[0],zmm16[0],zmm10[2],zmm16[2],zmm10[4],zmm16[4],zmm10[6],zmm16[6]
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm3, (%rsp) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm10, %zmm7
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm16, %zmm1, %zmm7
-; AVX512DQBW-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [6,14,6,14]
-; AVX512DQBW-FAST-NEXT:    # ymm4 = mem[0,1,0,1]
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm10, %zmm24
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm16, %zmm4, %zmm24
-; AVX512DQBW-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [15,7,15,7]
-; AVX512DQBW-FAST-NEXT:    # ymm2 = mem[0,1,0,1]
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm16, %zmm2, %zmm10
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm17, %zmm22
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm21, %zmm15, %zmm22
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm21, %zmm25
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm17, %zmm1, %zmm25
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm21, %zmm30
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm17, %zmm9, %zmm30
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm17, %zmm26, %zmm21
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm23, %zmm0
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm23, %zmm19
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm18, %zmm11, %zmm19
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm18, %zmm17
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm23, %zmm8, %zmm17
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm22 {%k3} = zmm23[0],zmm18[0],zmm23[2],zmm18[2],zmm23[4],zmm18[4],zmm23[6],zmm18[6]
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm23, %zmm3
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm18, %zmm1, %zmm3
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm18, %zmm4, %zmm23
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm18, %zmm2, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm24, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm9, %zmm15, %zmm0
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 192(%rdx), %zmm18
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 192(%rcx), %zmm0
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm18, %zmm10
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm0, %zmm9, %zmm10
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm16
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm18, %zmm15, %zmm16
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm18, %zmm10
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm0, %zmm1, %zmm10
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm0, %zmm18, %zmm28
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm0, %zmm26, %zmm18
-; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm13, %zmm5, %zmm15
-; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm5, %zmm13, %zmm9
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm13, %zmm28
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm5, %zmm26, %zmm13
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm5, %zmm1, %zmm28
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 192(%rdi), %zmm20
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 192(%rsi), %zmm0
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm20, %zmm21
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm0, %zmm4, %zmm21
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm20, %zmm10
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm0, %zmm1, %zmm10
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm27, %zmm26
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm6, %zmm11, %zmm26
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm6, %zmm5
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm27, %zmm8, %zmm5
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm15 {%k3} = zmm27[0],zmm6[0],zmm27[2],zmm6[2],zmm27[4],zmm6[4],zmm27[6],zmm6[6]
-; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm6, %zmm27, %zmm1
-; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm6, %zmm27, %zmm4
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm6, %zmm2, %zmm27
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm16 {%k3} = zmm20[0],zmm0[0],zmm20[2],zmm0[2],zmm20[4],zmm0[4],zmm20[6],zmm0[6]
-; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm0, %zmm20, %zmm11
-; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm20, %zmm0, %zmm8
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm0, %zmm20, %zmm6
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm0, %zmm2, %zmm20
-; AVX512DQBW-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
-; AVX512DQBW-FAST-NEXT:    movb $14, %sil
-; AVX512DQBW-FAST-NEXT:    kmovd %esi, %k3
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 (%rsp), %zmm2 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k3}
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm2, (%rsp) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm7 {%k1}
-; AVX512DQBW-FAST-NEXT:    movb $120, %sil
-; AVX512DQBW-FAST-NEXT:    kmovd %esi, %k4
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm7, %zmm0 {%k4}
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm27, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm20, %zmm16, %zmm0
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm24, %zmm2 # 64-byte Folded Reload
-; AVX512DQBW-FAST-NEXT:    # zmm2 = zmm24[0,1,2,3],mem[4,5,6,7]
-; AVX512DQBW-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm22 {%k3}
-; AVX512DQBW-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm15 {%k3}
-; AVX512DQBW-FAST-NEXT:    movb $-61, %sil
-; AVX512DQBW-FAST-NEXT:    kmovd %esi, %k5
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k5}
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm20, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm27, %zmm15, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm7, %zmm18, %zmm16
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm18, %zmm7, %zmm15
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm18, %zmm17
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm18, %zmm6
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm18, %zmm21
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm7, %zmm31, %zmm18
+; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [6,14,6,14,6,14,6,14]
+; AVX512DQBW-FAST-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm7, %zmm0, %zmm17
+; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm13 = [13,5,13,5,13,5,13,5]
+; AVX512DQBW-FAST-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm7, %zmm13, %zmm6
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm7, %zmm19, %zmm21
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm22, %zmm3
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm22, %zmm31
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm11, %zmm2
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm11, %zmm25, %zmm31
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm22, %zmm13, %zmm11
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm22
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm3, %zmm0, %zmm22
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm3, %zmm19, %zmm2
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm25, %zmm3 {%k1}
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, %zmm7 {%k4}
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm28, %zmm1 {%k1}
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, %zmm24 {%k4}
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm31 {%k3} = zmm8[0],zmm26[0],zmm8[2],zmm26[2],zmm8[4],zmm26[4],zmm8[6],zmm26[6]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm8, %zmm3
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm26, %zmm13, %zmm3
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm8, %zmm12
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm26, %zmm0, %zmm12
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm26, %zmm19, %zmm8
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm28, %zmm26
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm14, %zmm2
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm14, %zmm25, %zmm26
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm14, %zmm7
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm28, %zmm13, %zmm7
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm28, %zmm0, %zmm14
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm28, %zmm19, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm26 {%k3} = zmm9[0],zmm24[0],zmm9[2],zmm24[2],zmm9[4],zmm24[4],zmm9[6],zmm24[6]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm9, %zmm2
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm24, %zmm13, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm9, %zmm5
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm24, %zmm0, %zmm5
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm24, %zmm19, %zmm9
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm29, %zmm23, %zmm25
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm29, %zmm28
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm23, %zmm13, %zmm28
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm29, %zmm24
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm23, %zmm0, %zmm24
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm23, %zmm19, %zmm29
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm10, %zmm23
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm1, %zmm0, %zmm23
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm10, %zmm8
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm1, %zmm13, %zmm8
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm25 {%k3} = zmm27[0],zmm20[0],zmm27[2],zmm20[2],zmm27[4],zmm20[4],zmm27[6],zmm20[6]
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm20, %zmm27, %zmm13
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm20, %zmm27, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm20, %zmm19, %zmm27
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm10, %zmm9
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm1, %zmm19, %zmm9
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm1, %zmm19, %zmm10
+; AVX512DQBW-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm23 = zmm17[0,1,2,3],zmm23[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 192(%r8), %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm17 = <0,11,u,u,4,5,6,7>
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm1, %zmm4, %zmm17
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 192(%r9), %zmm4
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm20 = <0,1,11,u,4,5,6,7>
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm4, %zmm17, %zmm20
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm8, %zmm6 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm8 = <12,u,u,3,4,5,6,13>
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm1, %zmm6, %zmm8
 ; AVX512DQBW-FAST-NEXT:    movb $24, %sil
 ; AVX512DQBW-FAST-NEXT:    kmovd %esi, %k3
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm6 {%k3}
-; AVX512DQBW-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm23[0,1,2,3],zmm30[4,5,6,7]
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k5}
-; AVX512DQBW-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm4[0,1,2,3],zmm9[4,5,6,7]
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k5}
-; AVX512DQBW-FAST-NEXT:    movb $-31, %sil
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm9, %zmm21 {%k3}
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm6 = <u,1,2,3,4,15,u,u>
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm1, %zmm21, %zmm6
+; AVX512DQBW-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7]
+; AVX512DQBW-FAST-NEXT:    movb $14, %sil
 ; AVX512DQBW-FAST-NEXT:    kmovd %esi, %k4
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm6 {%k4}
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm6, %zmm4
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm28 {%k3}
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm14, %zmm28 {%k4}
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm13, %zmm27 {%k3}
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm12, %zmm27 {%k4}
-; AVX512DQBW-FAST-NEXT:    vmovdqa (%rdx), %xmm0
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0]
-; AVX512DQBW-FAST-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm9, %zmm0, %zmm31 {%k4}
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm11, %zmm3 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm9, %zmm0, %zmm26 {%k4}
+; AVX512DQBW-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm9, %zmm0, %zmm25 {%k4}
+; AVX512DQBW-FAST-NEXT:    movb $120, %sil
+; AVX512DQBW-FAST-NEXT:    kmovd %esi, %k4
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, %zmm9 {%k4}
+; AVX512DQBW-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm21 = zmm12[0,1,2,3],zmm22[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm7, %zmm2 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm7 {%k4}
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm28, %zmm13 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm13, %zmm11 {%k4}
+; AVX512DQBW-FAST-NEXT:    movb $-61, %sil
+; AVX512DQBW-FAST-NEXT:    kmovd %esi, %k4
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm21 {%k4}
+; AVX512DQBW-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm13 = zmm5[0,1,2,3],zmm14[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm13 {%k4}
+; AVX512DQBW-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm24[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm0 {%k4}
+; AVX512DQBW-FAST-NEXT:    vmovdqa (%rdx), %xmm2
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0]
+; AVX512DQBW-FAST-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX512DQBW-FAST-NEXT:    movb $12, %sil
 ; AVX512DQBW-FAST-NEXT:    kmovd %esi, %k4
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm29 {%k4}
-; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,0,8,0,8,0,8]
-; AVX512DQBW-FAST-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm6 # 64-byte Folded Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm0, %zmm12 {%k4}
+; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [0,8,0,8,0,8,0,8]
+; AVX512DQBW-FAST-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm3 # 64-byte Folded Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm5, %zmm28 {%k3}
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm5, %zmm14 {%k3}
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm29, %zmm27 {%k3}
+; AVX512DQBW-FAST-NEXT:    movb $-31, %sil
+; AVX512DQBW-FAST-NEXT:    kmovd %esi, %k3
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm5, %zmm28 {%k3}
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm5, %zmm14 {%k3}
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm14, %zmm29
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm30, %zmm27 {%k3}
+; AVX512DQBW-FAST-NEXT:    movb $112, %sil
+; AVX512DQBW-FAST-NEXT:    kmovd %esi, %k3
+; AVX512DQBW-FAST-NEXT:    vinserti64x2 $3, (%rax), %zmm3, %zmm12 {%k3}
 ; AVX512DQBW-FAST-NEXT:    vmovdqa 64(%rdx), %xmm3
 ; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0]
 ; AVX512DQBW-FAST-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm0, %zmm19 {%k4}
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm0, %zmm14 {%k4}
 ; AVX512DQBW-FAST-NEXT:    vmovdqa 128(%rdx), %xmm3
 ; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0]
 ; AVX512DQBW-FAST-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm0, %zmm26 {%k4}
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm0, %zmm19 {%k4}
 ; AVX512DQBW-FAST-NEXT:    vmovdqa 192(%rdx), %xmm3
 ; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0]
 ; AVX512DQBW-FAST-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm0, %zmm11 {%k4}
-; AVX512DQBW-FAST-NEXT:    movb $112, %sil
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm0, %zmm16 {%k4}
+; AVX512DQBW-FAST-NEXT:    vpbroadcastq 8(%rcx), %ymm3
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm3 = mem[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    movb $6, %sil
 ; AVX512DQBW-FAST-NEXT:    kmovd %esi, %k4
-; AVX512DQBW-FAST-NEXT:    vinserti64x2 $3, (%rax), %zmm6, %zmm29 {%k4}
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm0, %zmm5 {%k4}
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
-; AVX512DQBW-FAST-NEXT:    vinserti64x2 $3, 64(%rax), %zmm3, %zmm19 {%k4}
+; AVX512DQBW-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm3 # 64-byte Folded Reload
+; AVX512DQBW-FAST-NEXT:    vinserti64x2 $3, 64(%rax), %zmm3, %zmm14 {%k3}
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm14, %zmm17
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
-; AVX512DQBW-FAST-NEXT:    vinserti64x2 $3, 128(%rax), %zmm3, %zmm26 {%k4}
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 192(%r8), %zmm3
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 192(%r9), %zmm6
-; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm6, %zmm3, %zmm0
-; AVX512DQBW-FAST-NEXT:    vinserti64x2 $3, 192(%rax), %zmm0, %zmm11 {%k4}
-; AVX512DQBW-FAST-NEXT:    vpbroadcastq 8(%rcx), %ymm0
-; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQBW-FAST-NEXT:    movb $6, %sil
-; AVX512DQBW-FAST-NEXT:    kmovd %esi, %k4
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm31 {%k4}
-; AVX512DQBW-FAST-NEXT:    vpbroadcastq 72(%rcx), %ymm0
-; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm17 {%k4}
-; AVX512DQBW-FAST-NEXT:    vpbroadcastq 136(%rcx), %ymm0
-; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm5 {%k4}
-; AVX512DQBW-FAST-NEXT:    vpbroadcastq 200(%rcx), %ymm0
-; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm8 {%k4}
-; AVX512DQBW-FAST-NEXT:    movb $56, %cl
+; AVX512DQBW-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm3 # 64-byte Folded Reload
+; AVX512DQBW-FAST-NEXT:    vinserti64x2 $3, 128(%rax), %zmm3, %zmm19 {%k3}
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm19, %zmm22
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm4, %zmm1, %zmm2
+; AVX512DQBW-FAST-NEXT:    vinserti64x2 $3, 192(%rax), %zmm2, %zmm16 {%k3}
+; AVX512DQBW-FAST-NEXT:    movb $56, %sil
+; AVX512DQBW-FAST-NEXT:    kmovd %esi, %k3
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm5 {%k3}
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm5, %zmm14
+; AVX512DQBW-FAST-NEXT:    vpbroadcastq 72(%rcx), %ymm2
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = mem[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm0, %zmm3 {%k4}
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k3}
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, %zmm19
+; AVX512DQBW-FAST-NEXT:    vpbroadcastq 136(%rcx), %ymm2
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = mem[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm0, %zmm3 {%k4}
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k3}
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, %zmm24
+; AVX512DQBW-FAST-NEXT:    vpbroadcastq 200(%rcx), %ymm2
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = mem[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm0, %zmm15 {%k4}
+; AVX512DQBW-FAST-NEXT:    movb $64, %cl
 ; AVX512DQBW-FAST-NEXT:    kmovd %ecx, %k4
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm31 {%k4}
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm17 {%k4}
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm5 {%k4}
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm6, %zmm3, %zmm9
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 192(%rax), %zmm0
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm0, %zmm12, %zmm9
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm9, %zmm8 {%k4}
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm9, %zmm10 {%k1}
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm9, %zmm14 {%k1}
-; AVX512DQBW-FAST-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm21, %zmm9 # 64-byte Folded Reload
-; AVX512DQBW-FAST-NEXT:    # zmm9 = zmm21[0,1,2,3],mem[4,5,6,7]
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm12 = <0,11,u,u,4,5,6,7>
-; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm3, %zmm16, %zmm12
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm13 = <0,1,11,u,4,5,6,7>
-; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm6, %zmm12, %zmm13
-; AVX512DQBW-FAST-NEXT:    vmovdqa 192(%r8), %ymm12
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm12 = ymm12[0],mem[0],ymm12[2],mem[2]
-; AVX512DQBW-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm12[2,3,2,3],zmm0[2,3,2,3]
-; AVX512DQBW-FAST-NEXT:    movb $64, %al
-; AVX512DQBW-FAST-NEXT:    kmovd %eax, %k1
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, %zmm9 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, %zmm23 {%k4}
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm4, %zmm2, %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 192(%rax), %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm2, %zmm3, %zmm1
 ; AVX512DQBW-FAST-NEXT:    movb $8, %al
-; AVX512DQBW-FAST-NEXT:    kmovd %eax, %k1
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm13 {%k1}
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm18, %zmm20 {%k3}
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm12 = <12,u,u,3,4,5,6,13>
-; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm3, %zmm10, %zmm12
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm10 = <u,1,2,3,4,15,u,u>
-; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm3, %zmm20, %zmm10
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm3 = <0,12,u,3,4,5,6,7>
-; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm6, %zmm12, %zmm3
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm12 = <13,u,2,3,4,5,6,14>
-; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm6, %zmm9, %zmm12
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm9 = <u,1,2,3,4,5,15,u>
-; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm6, %zmm10, %zmm9
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm6 = [0,1,12,3,4,5,6,7]
-; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm0, %zmm3, %zmm6
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,13,2,3,4,5,6,7]
-; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm0, %zmm12, %zmm3
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm10 = [14,1,2,3,4,5,6,15]
-; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm0, %zmm9, %zmm10
+; AVX512DQBW-FAST-NEXT:    kmovd %eax, %k4
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm20 {%k4}
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, %zmm15 {%k3}
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm18, %zmm10 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqa 192(%r8), %ymm1
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[2],mem[2]
+; AVX512DQBW-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm10 {%k2} = zmm1[2,3,2,3],zmm2[2,3,2,3]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm1 = <0,12,u,3,4,5,6,7>
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm4, %zmm8, %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm3 = <13,u,2,3,4,5,6,14>
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm4, %zmm23, %zmm3
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm5 = <u,1,2,3,4,5,15,u>
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm4, %zmm6, %zmm5
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [0,1,12,3,4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm2, %zmm1, %zmm4
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,13,2,3,4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm2, %zmm3, %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [14,1,2,3,4,5,6,15]
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm2, %zmm5, %zmm3
 ; AVX512DQBW-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm14, 1472(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm8, 1408(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm11, 1344(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm10, 1472(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm15, 1408(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm16, 1344(%rax)
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm27, 1280(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, 1216(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm24, 1152(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm15, 1088(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, 1216(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm11, 1152(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm25, 1088(%rax)
 ; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 1024(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm5, 960(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm26, 896(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm28, 832(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, 768(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm24, 960(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm22, 896(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm29, 832(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm13, 768(%rax)
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm7, 704(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm22, 640(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm26, 640(%rax)
 ; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 576(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm17, 512(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm19, 448(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm4, 384(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 320(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 256(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovups (%rsp), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 192(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm19, 512(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm17, 448(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm28, 384(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm21, 320(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm9, 256(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm31, 192(%rax)
 ; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 128(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm31, 64(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm29, (%rax)
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm10, 1728(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, 1664(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm6, 1600(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm13, 1536(%rax)
-; AVX512DQBW-FAST-NEXT:    addq $2056, %rsp # imm = 0x808
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm14, 64(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm12, (%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, 1728(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, 1664(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm4, 1600(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm20, 1536(%rax)
+; AVX512DQBW-FAST-NEXT:    addq $2088, %rsp # imm = 0x828
 ; AVX512DQBW-FAST-NEXT:    vzeroupper
 ; AVX512DQBW-FAST-NEXT:    retq
   %in.vec0 = load <32 x i64>, ptr %in.vecptr0, align 64
@@ -12615,766 +12547,760 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ;
 ; AVX512F-ONLY-SLOW-LABEL: store_i64_stride7_vf64:
 ; AVX512F-ONLY-SLOW:       # %bb.0:
-; AVX512F-ONLY-SLOW-NEXT:    subq $6600, %rsp # imm = 0x19C8
+; AVX512F-ONLY-SLOW-NEXT:    subq $6248, %rsp # imm = 0x1868
 ; AVX512F-ONLY-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm7
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 64(%rdi), %zmm19
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%rsi), %zmm9
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 64(%rsi), %zmm16
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 64(%rdx), %zmm17
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm17, (%rsp) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%rdx), %zmm3
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 64(%rdi), %zmm29
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%rsi), %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%rcx), %zmm4
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 64(%rsi), %zmm13
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm13, (%rsp) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 64(%rdx), %zmm20
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%rdx), %zmm5
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%rcx), %zmm8
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 64(%rcx), %zmm18
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm10 = [11,3,11,3,11,3,11,3]
-; AVX512F-ONLY-SLOW-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm28 = [2,10,0,3,2,10,0,3]
-; AVX512F-ONLY-SLOW-NEXT:    # zmm28 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm11 = [11,3,11,3,11,3,11,3]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm31 = [2,10,0,3,2,10,0,3]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm31 = mem[0,1,2,3,0,1,2,3]
 ; AVX512F-ONLY-SLOW-NEXT:    movb $96, %r10b
 ; AVX512F-ONLY-SLOW-NEXT:    kmovw %r10d, %k1
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%r8), %zmm1
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%r9), %zmm11
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%rax), %zmm6
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 64(%rax), %zmm9
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%r9), %zmm15
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%rax), %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 64(%rax), %zmm6
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm25 = [15,7,15,7,15,7,15,7]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [9,1,9,1,9,1,9,1]
 ; AVX512F-ONLY-SLOW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm11, %zmm0, %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm14
-; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [4,9,0,3,4,9,0,3]
-; AVX512F-ONLY-SLOW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm6, %zmm0, %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm12
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm15, %zmm0, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm8
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm24 = [4,9,0,3,4,9,0,3]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm24 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm24, %zmm2
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm10, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm8, %zmm28, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm9, %zmm11, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm31, %zmm2
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k1}
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa (%r9), %ymm5
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa (%r9), %ymm7
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 64(%r9), %ymm3
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa (%r8), %ymm0
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 64(%r8), %ymm4
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[2],ymm5[2]
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[2],ymm7[2]
 ; AVX512F-ONLY-SLOW-NEXT:    movb $28, %r10b
 ; AVX512F-ONLY-SLOW-NEXT:    kmovw %r10d, %k2
-; AVX512F-ONLY-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm0[2,3,2,3],zmm6[2,3,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm0[2,3,2,3],zmm5[2,3,2,3]
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [4,12,0,5,4,12,0,5]
 ; AVX512F-ONLY-SLOW-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm5
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm11, %zmm2, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm16
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm15, %zmm2, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm7
 ; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [0,1,12,7,0,1,12,7]
 ; AVX512F-ONLY-SLOW-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm6, %zmm2, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm11
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm2, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm14
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [5,0,14,6,5,0,14,6]
-; AVX512F-ONLY-SLOW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm17
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm13 = [5,0,14,6,5,0,14,6]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm15, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm15, %zmm9
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm13, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [0,13,6,7,0,13,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm2, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm15
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm9, %zmm25, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [6,13,14,7,6,13,14,7]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm10, %zmm5
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm0, %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm19
-; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [0,13,6,7,0,13,6,7]
-; AVX512F-ONLY-SLOW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm6, %zmm0, %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm15
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm5 = [15,7,15,7,15,7,15,7]
-; AVX512F-ONLY-SLOW-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm17, %zmm5, %zmm1
-; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [6,13,14,7,6,13,14,7]
-; AVX512F-ONLY-SLOW-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm7, %zmm6
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm29, %zmm1
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm13, %zmm10, %zmm1
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm18, %zmm28, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm19, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm16, %zmm11, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm18, %zmm31, %zmm2
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
 ; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm4[0],ymm3[0],ymm4[2],ymm3[2]
-; AVX512F-ONLY-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm1[2,3,2,3],zmm9[2,3,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm1[2,3,2,3],zmm6[2,3,2,3]
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 64(%r8), %zmm1
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 64(%r9), %zmm3
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm14, %zmm18
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm14, %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm4
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm9, %zmm12, %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm16, %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm9, %zmm11, %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 64(%r9), %zmm16
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm16, %zmm8, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm6, %zmm24, %zmm3
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm19, %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm9, %zmm0, %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm5, %zmm1
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm7, %zmm9
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 128(%rdi), %zmm23
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 128(%rsi), %zmm24
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm1
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm24, %zmm10, %zmm1
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm30
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 128(%rdx), %zmm25
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 128(%rcx), %zmm12
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm25, %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm12, %zmm28, %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 128(%rax), %zmm9
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 128(%r9), %ymm6
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm16, %zmm7, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm6, %zmm14, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm13, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm6, %zmm15, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm29
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm16, %zmm25, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm10, %zmm6
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 128(%rdi), %zmm27
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 128(%rsi), %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm27, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm11, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm17
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 128(%rdx), %zmm21
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 128(%rcx), %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm31, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm3 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 128(%rax), %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 128(%r9), %ymm4
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 128(%r8), %ymm1
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm6[0],ymm1[2],ymm6[2]
-; AVX512F-ONLY-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm1[2,3,2,3],zmm9[2,3,2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[2],ymm4[2]
+; AVX512F-ONLY-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm1[2,3,2,3],zmm5[2,3,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 128(%r8), %zmm1
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 128(%r9), %zmm21
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm21, %zmm14, %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm9, %zmm4, %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm10
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm3
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm21, %zmm16, %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm9, %zmm11, %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm19, %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm9, %zmm0, %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm21, %zmm5, %zmm1
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm7, %zmm9
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 192(%rdi), %zmm27
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 192(%rsi), %zmm26
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm27, %zmm1
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm26, %zmm30, %zmm1
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 192(%rdx), %zmm17
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 192(%rcx), %zmm14
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm14, %zmm28, %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 192(%rax), %zmm21
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 192(%r9), %ymm9
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 128(%r9), %zmm6
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm6, %zmm8, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm11
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm24, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm6, %zmm7, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm14, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm13, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm15, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm20
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm6, %zmm25, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm10, %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 192(%rdi), %zmm26
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 192(%rsi), %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm26, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm17, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 192(%rdx), %zmm18
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 192(%rcx), %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm31, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm3 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 192(%rax), %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 192(%r9), %ymm5
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 192(%r8), %ymm1
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm9[0],ymm1[2],ymm9[2]
-; AVX512F-ONLY-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm1[2,3,2,3],zmm21[2,3,2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm5[0],ymm1[2],ymm5[2]
+; AVX512F-ONLY-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm1[2,3,2,3],zmm2[2,3,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 192(%r8), %zmm1
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 192(%r9), %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm18, %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm21, %zmm4, %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm16, %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm21, %zmm11, %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 192(%r9), %zmm8
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm8, %zmm11, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm2, %zmm24, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm8, %zmm7, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm28
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm2, %zmm14, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm14, %zmm30
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm13, %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm15, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm2, %zmm15, %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm9
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm8, %zmm25, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm10, %zmm0
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm19, %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm21, %zmm15, %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm15, %zmm31
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm5, %zmm1
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm7, %zmm21
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 256(%rdi), %zmm21
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 256(%rdi), %zmm15
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 256(%rsi), %zmm22
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm22, %zmm30, %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 256(%rdx), %zmm16
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 256(%rcx), %zmm15
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm4
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm15, %zmm28, %zmm4
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm4 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm15, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm14
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm22, %zmm17, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 256(%rdx), %zmm12
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 256(%rcx), %zmm23
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm23, %zmm31, %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm5 {%k1}
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 256(%rax), %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 256(%r9), %ymm6
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 256(%r8), %ymm2
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm2 = ymm2[0],ymm6[0],ymm2[2],ymm6[2]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm6
-; AVX512F-ONLY-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm2[2,3,2,3],zmm0[2,3,2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 256(%r8), %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 256(%r9), %zmm13
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm4
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm13, %zmm18, %zmm4
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm10, %zmm4
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm4
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm13, %zmm3, %zmm4
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm6, %zmm11, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 256(%r9), %ymm7
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 256(%r8), %ymm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm7[0],ymm1[2],ymm7[2]
+; AVX512F-ONLY-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm1[2,3,2,3],zmm0[2,3,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 256(%r8), %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 256(%r9), %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm17
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm7
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm2, %zmm11, %zmm17
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm24, %zmm17
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm11
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm2, %zmm28, %zmm11
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm30, %zmm11
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm11
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm13, %zmm11
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm3, %zmm11
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm4
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm2, %zmm19, %zmm4
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm6, %zmm31, %zmm4
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm13, %zmm5, %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm2, %zmm7, %zmm6
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 320(%rdi), %zmm6
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 320(%rsi), %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm4
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm2, %zmm30, %zmm4
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 320(%rdx), %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 320(%rcx), %zmm13
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm9
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm13, %zmm28, %zmm9
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm9 {%k1}
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 320(%rax), %zmm3
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 320(%r9), %ymm30
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %ymm30, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 320(%r8), %ymm4
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm4 = ymm4[0],ymm30[0],ymm4[2],ymm30[2]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm30
-; AVX512F-ONLY-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm9 {%k2} = zmm4[2,3,2,3],zmm3[2,3,2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 320(%r8), %zmm4
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 320(%r9), %zmm9
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm3
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm9, %zmm18, %zmm3
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm30, %zmm10, %zmm3
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm3
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm9, %zmm0, %zmm3
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm30, %zmm11, %zmm3
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm19, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm30, %zmm31, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm2, %zmm25, %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm10, %zmm0
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm9, %zmm5, %zmm4
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm7, %zmm30
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm11 = [13,5,13,5,13,5,13,5]
-; AVX512F-ONLY-SLOW-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm8, %zmm11, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 320(%rdi), %zmm19
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 320(%rsi), %zmm0
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [6,14,6,14,6,14,6,14]
-; AVX512F-ONLY-SLOW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm3
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm8, %zmm0, %zmm3
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm3
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm8, %zmm5, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm19, %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm14, %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 320(%rdx), %zmm11
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 320(%rcx), %zmm17
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm14
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm17, %zmm31, %zmm14
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm14 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 320(%rax), %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 320(%r9), %ymm5
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 320(%r8), %ymm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm5 = ymm0[0],ymm5[0],ymm0[2],ymm5[2]
+; AVX512F-ONLY-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm5[2,3,2,3],zmm1[2,3,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 320(%r8), %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 320(%r9), %zmm14
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm14, %zmm7, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm24, %zmm0
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [3,0,12,4,3,0,12,4]
-; AVX512F-ONLY-SLOW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm0, %zmm8
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm9
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm4
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm8, %zmm11, %zmm4
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm4
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm8, %zmm3, %zmm4
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm4
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm8, %zmm5, %zmm4
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm0, %zmm8
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm10
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm25, %zmm1
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm12, %zmm11, %zmm1
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm25, %zmm1
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm12, %zmm3, %zmm1
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm25, %zmm1
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm12, %zmm5, %zmm1
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm25, %zmm0, %zmm12
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm30
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm1
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm14, %zmm11, %zmm1
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm1
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm14, %zmm3, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm14, %zmm28, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm30, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm14, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm13, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm3, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm14, %zmm25, %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm10, %zmm1
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm1
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm14, %zmm5, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 384(%rdx), %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 384(%rcx), %zmm24
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm24, %zmm25, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 384(%rdi), %zmm10
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 384(%rsi), %zmm1
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm17, %zmm0, %zmm14
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm14, %zmm17
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm1
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm15, %zmm11, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm25, %zmm10
+; AVX512F-ONLY-SLOW-NEXT:    movb $24, %r10b
+; AVX512F-ONLY-SLOW-NEXT:    kmovw %r10d, %k3
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm10 {%k3}
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [0,8,0,8,0,8,0,8]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm16, %zmm1, %zmm29
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm6, %zmm1, %zmm20
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm8, %zmm1, %zmm9
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm2, %zmm1, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm14, %zmm1, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 384(%r8), %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = <u,1,2,3,4,15,u,u>
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm1, %zmm10
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 384(%r9), %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = <u,1,2,3,4,5,15,u>
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm3, %zmm10
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm1, %zmm0, %zmm7
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm1, %zmm0, %zmm28
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm13, %zmm1
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm1
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm15, %zmm3, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm7 = [13,5,13,5,13,5,13,5]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm7, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm10 = [6,14,6,14,6,14,6,14]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm10, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm25, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [3,0,12,4,3,0,12,4]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm13, %zmm7, %zmm1
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm1
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm15, %zmm5, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm13, %zmm10, %zmm1
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm16, %zmm0, %zmm15
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm15, %zmm16
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm1
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm13, %zmm11, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm13, %zmm25, %zmm1
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm1
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm4
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm13, %zmm3, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm2, %zmm13
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm14
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm29, %zmm7, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm29, %zmm10, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm29, %zmm25, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm21, %zmm2, %zmm29
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm20, %zmm7, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm20, %zmm10, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm20, %zmm25, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm18, %zmm2, %zmm20
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm23, %zmm7, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm23, %zmm10, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm23, %zmm25, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm12, %zmm2, %zmm23
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm21
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm17, %zmm7, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm17, %zmm10, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm17, %zmm25, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm11, %zmm2, %zmm17
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm28
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 448(%rdx), %zmm8
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 448(%rcx), %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm6
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm31, %zmm6
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm24, %zmm11
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm24, %zmm5, %zmm31
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm31, (%rsp) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm24, %zmm7, %zmm1
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm1
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm13, %zmm5, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm24, %zmm10, %zmm1
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm2, %zmm0, %zmm13
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm18
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 448(%rdx), %zmm14
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 448(%rcx), %zmm1
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm14, %zmm12
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm28, %zmm12
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm14, %zmm19
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm3, %zmm19
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 384(%rdx), %zmm3
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 384(%rcx), %zmm20
-; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm20, %zmm3, %zmm28
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm20, %zmm11, %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm20, %zmm3, %zmm4
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm25
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm20, %zmm5, %zmm25
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm0, %zmm20
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm7
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm14, %zmm0, %zmm7
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm14, %zmm15
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm5, %zmm14
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm11, %zmm15
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm2, %zmm11
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm9
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm8, %zmm2, %zmm9
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm12
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm10, %zmm12
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm13
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm7, %zmm13
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm25, %zmm8
 ; AVX512F-ONLY-SLOW-NEXT:    movb $48, %r10b
-; AVX512F-ONLY-SLOW-NEXT:    kmovw %r10d, %k3
-; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [1,0,10,2,1,0,10,2]
-; AVX512F-ONLY-SLOW-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    kmovw %r10d, %k4
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm5 = [1,0,10,2,1,0,10,2]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm2
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm1, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm5, %zmm2
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm9 {%k3} = zmm0[0],zmm3[0],zmm0[2],zmm3[2],zmm0[4],zmm3[4],zmm0[6],zmm3[6]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm4
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm5
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm9
-; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [0,8,0,1,0,8,0,1]
-; AVX512F-ONLY-SLOW-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm2, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm11, %zmm4
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm13 = [6,14,6,14]
-; AVX512F-ONLY-SLOW-NEXT:    # ymm13 = mem[0,1,0,1]
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm13, %zmm5
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm8 = [15,7,15,7]
-; AVX512F-ONLY-SLOW-NEXT:    # ymm8 = mem[0,1,0,1]
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm8, %zmm9
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 {%k4} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm2
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm3
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm29, %zmm1, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm16
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm4 = [0,8,0,1,0,8,0,1]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm4, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm7, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm10, %zmm3
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm10 {%k3} = zmm29[0],zmm0[0],zmm29[2],zmm0[2],zmm29[4],zmm0[4],zmm29[6],zmm0[6]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm29, %zmm4
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm29, %zmm3
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm29, %zmm5
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm2, %zmm29
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm11, %zmm4
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm13, %zmm3
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm8, %zmm5
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm24, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm23, %zmm1, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm30 {%k3} = zmm23[0],zmm24[0],zmm23[2],zmm24[2],zmm23[4],zmm24[4],zmm23[6],zmm24[6]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm3
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm4
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm24, %zmm2, %zmm23
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm24, %zmm11, %zmm3
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm24, %zmm13, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm24, %zmm8, %zmm4
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm26, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm27, %zmm1, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm25, %zmm16
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm5, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm14 {%k4} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm14
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm4, %zmm0
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm17 {%k3} = zmm27[0],zmm26[0],zmm27[2],zmm26[2],zmm27[4],zmm26[4],zmm27[6],zmm26[6]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm27, %zmm3
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm27, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm27, %zmm31
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm26, %zmm2, %zmm27
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm26, %zmm11, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm7, %zmm3
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm26, %zmm13, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm26, %zmm8, %zmm31
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm10, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm25, %zmm14
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm27, %zmm5, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm29 {%k4} = zmm27[0],zmm0[0],zmm27[2],zmm0[2],zmm27[4],zmm0[4],zmm27[6],zmm0[6]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm27, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm27, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm27, %zmm23
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm4, %zmm27
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm7, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm10, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm25, %zmm23
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm26, %zmm5, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm20 {%k4} = zmm26[0],zmm0[0],zmm26[2],zmm0[2],zmm26[4],zmm0[4],zmm26[6],zmm0[6]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm26, %zmm29
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm26, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm26, %zmm31
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm4, %zmm26
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm7, %zmm29
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm10, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm25, %zmm31
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm22, %zmm30
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm21, %zmm1, %zmm30
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm16 {%k3} = zmm21[0],zmm22[0],zmm21[2],zmm22[2],zmm21[4],zmm22[4],zmm21[6],zmm22[6]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm3
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm27
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm29
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm22, %zmm2, %zmm29
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm22, %zmm11, %zmm3
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm22, %zmm13, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm15, %zmm5, %zmm30
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm21 {%k4} = zmm15[0],zmm22[0],zmm15[2],zmm22[2],zmm15[4],zmm22[4],zmm15[6],zmm22[6]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm15, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm15, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm15, %zmm26
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm15, %zmm27
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm22, %zmm4, %zmm27
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm22, %zmm7, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm22, %zmm10, %zmm0
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm22, %zmm8, %zmm27
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm26
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm6, %zmm1, %zmm26
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm18 {%k3} = zmm6[0],zmm0[0],zmm6[2],zmm0[2],zmm6[4],zmm0[4],zmm6[6],zmm0[6]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm28
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm3
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm23
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm24
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm2, %zmm24
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm11, %zmm28
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm13, %zmm3
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm8, %zmm23
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 448(%rdi), %zmm9
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm22, %zmm25, %zmm26
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 448(%rdi), %zmm15
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 448(%rsi), %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm6
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm5, %zmm6
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm3
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm13, %zmm3
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm4
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm11, %zmm4
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 384(%rdi), %zmm21
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 384(%rsi), %zmm17
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm22
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm21, %zmm1, %zmm22
-; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm17, %zmm21, %zmm5
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm17, %zmm21, %zmm11
-; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm17, %zmm21, %zmm13
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm20 {%k3} = zmm21[0],zmm17[0],zmm21[2],zmm17[2],zmm21[4],zmm17[4],zmm21[6],zmm17[6]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm18
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm17, %zmm2, %zmm21
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm17, %zmm8, %zmm18
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm7 {%k3} = zmm9[0],zmm0[0],zmm9[2],zmm0[2],zmm9[4],zmm0[4],zmm9[6],zmm0[6]
-; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm0, %zmm9, %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm9, %zmm0, %zmm1
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm8, %zmm9
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm12 {%k1}
-; AVX512F-ONLY-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm19[4,5,6,7]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 448(%r8), %zmm6
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm0 = <0,11,u,u,4,5,6,7>
-; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm6, %zmm7, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 448(%r9), %zmm7
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm8 = <0,1,11,u,4,5,6,7>
-; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm7, %zmm0, %zmm8
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm10
-; AVX512F-ONLY-SLOW-NEXT:    movb $4, %sil
-; AVX512F-ONLY-SLOW-NEXT:    kmovw %esi, %k3
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm12 {%k3}
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm8 = <0,1,2,10,u,5,6,7>
-; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm7, %zmm12, %zmm8
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm15, %zmm4 {%k1}
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm0 = <12,u,u,3,4,5,6,13>
-; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm6, %zmm4, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm4 = <0,12,u,3,4,5,6,7>
-; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm7, %zmm0, %zmm4
-; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,0,8,0,8,0,8]
-; AVX512F-ONLY-SLOW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm15, %zmm17
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm25, %zmm17
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm19, %zmm22
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm25, %zmm22
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm24
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm19, %zmm5, %zmm24
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm28 {%k4} = zmm19[0],zmm1[0],zmm19[2],zmm1[2],zmm19[4],zmm1[4],zmm19[6],zmm1[6]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm19, %zmm18
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm19, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm19, %zmm21
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm4, %zmm21
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm7, %zmm18
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm10, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm19 # 64-byte Folded Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm12 # 64-byte Folded Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm12 # 64-byte Folded Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm12 # 64-byte Folded Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm12 # 64-byte Folded Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    movb $24, %sil
-; AVX512F-ONLY-SLOW-NEXT:    kmovw %esi, %k5
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm25, %zmm18 {%k5}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm19, %zmm20
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm28, %zmm5, %zmm20
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm15, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm25, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm19, %zmm28, %zmm25
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm15, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm7, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm19, %zmm28, %zmm7
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm15, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm10, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm19, %zmm28, %zmm10
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm11 {%k4} = zmm28[0],zmm19[0],zmm28[2],zmm19[2],zmm28[4],zmm19[4],zmm28[6],zmm19[6]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm19, %zmm4, %zmm28
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm9 {%k4} = zmm15[0],zmm0[0],zmm15[2],zmm0[2],zmm15[4],zmm0[4],zmm15[6],zmm0[6]
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm0, %zmm15, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm15, %zmm5, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm12[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm6 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 448(%r8), %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm5 = <0,11,u,u,4,5,6,7>
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm3, %zmm9, %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm1 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm9 = <12,u,u,3,4,5,6,13>
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm3, %zmm1, %zmm9
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm17 {%k3}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = <u,1,2,3,4,15,u,u>
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm1, %zmm17
 ; AVX512F-ONLY-SLOW-NEXT:    movb $6, %sil
-; AVX512F-ONLY-SLOW-NEXT:    kmovw %esi, %k3
-; AVX512F-ONLY-SLOW-NEXT:    vpbroadcastq 456(%rcx), %ymm12
-; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm12 = mem[0,1,2,3],ymm12[4,5,6,7]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm12, %zmm0, %zmm1 {%k3}
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm12 = <0,1,2,9,u,u,6,7>
-; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm6, %zmm1, %zmm12
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm14, %zmm9 {%k5}
-; AVX512F-ONLY-SLOW-NEXT:    movb $64, %sil
 ; AVX512F-ONLY-SLOW-NEXT:    kmovw %esi, %k4
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm3 {%k4}
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = <u,1,2,3,4,15,u,u>
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm6, %zmm1, %zmm9
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 384(%r8), %zmm6
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm6, %zmm1, %zmm18
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 384(%r9), %zmm1
-; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm1, %zmm6, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm1, %zmm6, %zmm14
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm1, %zmm6, %zmm15
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm6, %zmm1, %zmm16
+; AVX512F-ONLY-SLOW-NEXT:    vpbroadcastq 456(%rcx), %ymm1
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm0 {%k4}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm8 = <0,1,2,9,u,u,6,7>
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm3, %zmm0, %zmm8
+; AVX512F-ONLY-SLOW-NEXT:    movb $64, %sil
+; AVX512F-ONLY-SLOW-NEXT:    kmovw %esi, %k5
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm2 {%k5}
+; AVX512F-ONLY-SLOW-NEXT:    movb $4, %sil
+; AVX512F-ONLY-SLOW-NEXT:    kmovw %esi, %k5
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm6 {%k5}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 448(%r9), %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = <0,1,11,u,4,5,6,7>
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm0, %zmm5, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm11
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = <0,1,2,10,u,5,6,7>
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm0, %zmm6, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm5 = <0,12,u,3,4,5,6,7>
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm0, %zmm9, %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm6 = <13,u,2,3,4,5,6,14>
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm0, %zmm2, %zmm6
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = <u,1,2,3,4,5,15,u>
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm1, %zmm17
 ; AVX512F-ONLY-SLOW-NEXT:    movb $12, %sil
-; AVX512F-ONLY-SLOW-NEXT:    kmovw %esi, %k4
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 448(%rdx), %xmm6
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm6, %zmm0, %zmm2 {%k4}
-; AVX512F-ONLY-SLOW-NEXT:    vinserti32x4 $2, 448(%r8), %zmm2, %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm6 = <0,1,2,3,4,8,u,7>
-; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm7, %zmm2, %zmm6
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm2 = <u,1,2,3,4,5,15,u>
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm2, %zmm18
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = <0,1,2,3,9,u,6,7>
-; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm7, %zmm12, %zmm1
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm12 = <13,u,2,3,4,5,6,14>
-; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm7, %zmm3, %zmm12
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm2, %zmm9
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 448(%rax), %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,2,3,10,5,6,7]
-; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm2, %zmm8, %zmm3
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    kmovw %esi, %k5
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 448(%rdx), %xmm2
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm0, %zmm4 {%k5}
+; AVX512F-ONLY-SLOW-NEXT:    vinserti32x4 $2, 448(%r8), %zmm4, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm4 = <0,1,2,3,4,8,u,7>
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm0, %zmm2, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm2 = <0,1,2,3,9,u,6,7>
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm0, %zmm8, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 384(%rax), %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm1, %zmm9
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 (%rsp), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm25, %zmm1 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 384(%r9), %ymm12
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 384(%r8), %ymm8
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm8 = ymm8[0],ymm12[0],ymm8[2],ymm12[2]
+; AVX512F-ONLY-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k2} = zmm8[2,3,2,3],zmm0[2,3,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, (%rsp) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm8, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm8, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm8 = [14,1,2,3,4,5,6,15]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm8, %zmm12
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 448(%rax), %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm12 = [0,1,2,3,10,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm0, %zmm3, %zmm12
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,12,3,4,5,6,7]
-; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm2, %zmm4, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm0, %zmm5, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,13,2,3,4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm0, %zmm6, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm8, %zmm17
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,2,3,4,5,8,7]
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm0, %zmm4, %zmm3
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 384(%rax), %zmm3
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm4, %zmm14
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm14, %zmm7
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm8 {%k1}
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 384(%r9), %ymm14
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 384(%r8), %ymm4
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm4 = ymm4[0],ymm14[0],ymm4[2],ymm14[2]
-; AVX512F-ONLY-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm8 {%k2} = zmm4[2,3,2,3],zmm3[2,3,2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm4, %zmm15
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm15, %zmm8
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm4, %zmm16
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,8,7]
-; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm2, %zmm6, %zmm4
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [14,1,2,3,4,5,6,15]
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm4, %zmm18
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,2,3,4,9,6,7]
-; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm2, %zmm1, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm0, %zmm2, %zmm3
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,13,2,3,4,5,6,7]
-; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm2, %zmm12, %zmm1
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm2, %zmm4, %zmm9
 ; AVX512F-ONLY-SLOW-NEXT:    movb $8, %sil
 ; AVX512F-ONLY-SLOW-NEXT:    kmovw %esi, %k2
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm10 {%k2}
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k5}
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm3 {%k5}
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k5}
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm31 {%k5}
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm27 {%k5}
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm23 {%k5}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm11 {%k2}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm16 {%k3}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm14 {%k3}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm23 {%k3}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm31 {%k3}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm26 {%k3}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm22 {%k3}
 ; AVX512F-ONLY-SLOW-NEXT:    movb $-31, %sil
 ; AVX512F-ONLY-SLOW-NEXT:    kmovw %esi, %k2
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k2}
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm3 {%k2}
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k2}
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm31 {%k2}
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm27 {%k2}
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm23 {%k2}
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa (%rdx), %xmm1
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm16 {%k2}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm14 {%k2}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm23 {%k2}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm31 {%k2}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm26 {%k2}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm22 {%k2}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa (%rdx), %xmm0
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm4 {%k5}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 64(%rdx), %xmm0
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm5 {%k5}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 128(%rdx), %xmm0
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm2 {%k4}
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 64(%rdx), %xmm1
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k5}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 192(%rdx), %xmm0
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm3 {%k4}
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 128(%rdx), %xmm1
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm4 {%k4}
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 192(%rdx), %xmm1
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm6 {%k4}
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 256(%rdx), %xmm1
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm29 {%k4}
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 320(%rdx), %xmm1
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm24 {%k4}
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 384(%rdx), %xmm1
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm21 {%k4}
-; AVX512F-ONLY-SLOW-NEXT:    vinserti32x4 $3, (%rax), %zmm19, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm3 {%k5}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 256(%rdx), %xmm0
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm27 {%k5}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 320(%rdx), %xmm0
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm21 {%k5}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 384(%rdx), %xmm0
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm28 {%k5}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vinserti32x4 $3, (%rax), %zmm0, %zmm0
 ; AVX512F-ONLY-SLOW-NEXT:    movb $112, %sil
 ; AVX512F-ONLY-SLOW-NEXT:    kmovw %esi, %k2
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k2}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm4 {%k2}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vinserti32x4 $3, 64(%rax), %zmm0, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm5 {%k2}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vinserti32x4 $3, 128(%rax), %zmm0, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k2}
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vinserti32x4 $3, 64(%rax), %zmm1, %zmm1
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm3 {%k2}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vinserti32x4 $3, 192(%rax), %zmm0, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm3 {%k2}
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vinserti32x4 $3, 128(%rax), %zmm1, %zmm1
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k2}
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vinserti32x4 $3, 192(%rax), %zmm5, %zmm1
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm6 {%k2}
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vinserti32x4 $3, 256(%rax), %zmm1, %zmm1
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm29 {%k2}
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vinserti32x4 $3, 320(%rax), %zmm1, %zmm1
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm24 {%k2}
-; AVX512F-ONLY-SLOW-NEXT:    vinserti32x4 $3, 384(%rax), %zmm0, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vinserti32x4 $3, 256(%rax), %zmm0, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm27 {%k2}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vinserti32x4 $3, 320(%rax), %zmm0, %zmm0
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm21 {%k2}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vinserti32x4 $3, 384(%rax), %zmm0, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm28 {%k2}
 ; AVX512F-ONLY-SLOW-NEXT:    vpbroadcastq 8(%rcx), %ymm0
 ; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm1 {%k3}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm5 {%k4}
 ; AVX512F-ONLY-SLOW-NEXT:    vpbroadcastq 72(%rcx), %ymm0
 ; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k3}
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k4}
 ; AVX512F-ONLY-SLOW-NEXT:    vpbroadcastq 136(%rcx), %ymm0
 ; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 (%rsp), %zmm3 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm3 {%k3}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm3 {%k4}
 ; AVX512F-ONLY-SLOW-NEXT:    vpbroadcastq 200(%rcx), %ymm0
 ; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm4 {%k3}
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm4 {%k4}
 ; AVX512F-ONLY-SLOW-NEXT:    vpbroadcastq 264(%rcx), %ymm0
 ; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm30 {%k3}
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm30 {%k4}
 ; AVX512F-ONLY-SLOW-NEXT:    vpbroadcastq 328(%rcx), %ymm0
 ; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm26 {%k3}
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm24 {%k4}
 ; AVX512F-ONLY-SLOW-NEXT:    vpbroadcastq 392(%rcx), %ymm0
 ; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm22 {%k3}
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm20 {%k4}
 ; AVX512F-ONLY-SLOW-NEXT:    movb $56, %cl
 ; AVX512F-ONLY-SLOW-NEXT:    kmovw %ecx, %k2
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k2}
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm5 {%k2}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k2}
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm3 {%k2}
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, (%rsp) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm4 {%k2}
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm30 {%k2}
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm26 {%k2}
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm22 {%k2}
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm24 {%k2}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm20 {%k2}
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k1}
@@ -13385,64 +13311,65 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm4 {%k1}
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm29 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm5 {%k1}
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm28 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm18 {%k1}
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm11 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm7 {%k1}
 ; AVX512F-ONLY-SLOW-NEXT:    movb $120, %cl
 ; AVX512F-ONLY-SLOW-NEXT:    kmovw %ecx, %k1
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm16 {%k1}
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm15 # 64-byte Folded Reload
-; AVX512F-ONLY-SLOW-NEXT:    # zmm15 = zmm0[0,1,2,3],mem[4,5,6,7]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm17 {%k1}
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm19 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm19 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm16 # 64-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT:    # zmm16 = zmm0[0,1,2,3],mem[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm14 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm15 {%k1}
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm25 {%k1}
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm10 {%k1}
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm31 {%k1}
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm5
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm5 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm29, %zmm25 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm29 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm23 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm12 {%k1}
 ; AVX512F-ONLY-SLOW-NEXT:    movb $-61, %cl
 ; AVX512F-ONLY-SLOW-NEXT:    kmovw %ecx, %k1
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm15 {%k1}
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm14 # 64-byte Folded Reload
-; AVX512F-ONLY-SLOW-NEXT:    # zmm14 = zmm0[0,1,2,3],mem[4,5,6,7]
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm14 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm16 {%k1}
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm12 # 64-byte Folded Reload
-; AVX512F-ONLY-SLOW-NEXT:    # zmm12 = zmm0[0,1,2,3],mem[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm13 # 64-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT:    # zmm13 = zmm0[0,1,2,3],mem[4,5,6,7]
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm12 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm13 {%k1}
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
-; AVX512F-ONLY-SLOW-NEXT:    # zmm3 = zmm0[0,1,2,3],mem[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm11 # 64-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT:    # zmm11 = zmm0[0,1,2,3],mem[4,5,6,7]
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm3 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm11 {%k1}
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512F-ONLY-SLOW-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload
 ; AVX512F-ONLY-SLOW-NEXT:    # zmm4 = zmm0[0,1,2,3],mem[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm4 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT:    # zmm5 = zmm0[0,1,2,3],mem[4,5,6,7]
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm4 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm5 {%k1}
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512F-ONLY-SLOW-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm6 # 64-byte Folded Reload
 ; AVX512F-ONLY-SLOW-NEXT:    # zmm6 = zmm0[0,1,2,3],mem[4,5,6,7]
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm6 {%k1}
-; AVX512F-ONLY-SLOW-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm7 # 64-byte Folded Reload
-; AVX512F-ONLY-SLOW-NEXT:    # zmm7 = zmm13[0,1,2,3],mem[4,5,6,7]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm7 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm7 # 64-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT:    # zmm7 = zmm10[0,1,2,3],mem[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm7 {%k1}
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
 ; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
 ; AVX512F-ONLY-SLOW-NEXT:    # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3]
@@ -13450,22 +13377,29 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7]
 ; AVX512F-ONLY-SLOW-NEXT:    movb $14, %cl
 ; AVX512F-ONLY-SLOW-NEXT:    kmovw %ecx, %k1
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm8, %zmm0, %zmm13 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm8, %zmm0, %zmm10 {%k1}
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
 ; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
 ; AVX512F-ONLY-SLOW-NEXT:    # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3]
 ; AVX512F-ONLY-SLOW-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,2,3,3]
 ; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm8, %zmm0, %zmm11 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm8, %zmm0, %zmm18 {%k1}
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
 ; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
 ; AVX512F-ONLY-SLOW-NEXT:    # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3]
 ; AVX512F-ONLY-SLOW-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,2,3,3]
 ; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm8, %zmm0, %zmm28 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm8, %zmm0, %zmm9 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT:    # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3]
+; AVX512F-ONLY-SLOW-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,2,3,3]
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm8, %zmm0, %zmm3 {%k1}
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
 ; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
 ; AVX512F-ONLY-SLOW-NEXT:    # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3]
@@ -13480,88 +13414,81 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7]
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
 ; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm8, %zmm0, %zmm1 {%k1}
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
-; AVX512F-ONLY-SLOW-NEXT:    # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT:    # ymm8 = ymm0[1],mem[1],ymm0[3],mem[3]
 ; AVX512F-ONLY-SLOW-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,2,3,3]
 ; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7]
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm8, %zmm0, %zmm0 {%k1}
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
-; AVX512F-ONLY-SLOW-NEXT:    # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3]
-; AVX512F-ONLY-SLOW-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,2,3,3]
-; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm8, %zmm0, %zmm20 {%k1}
 ; AVX512F-ONLY-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, 3008(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, 2944(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm20, 2880(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm7, 2816(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm22, 2752(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm21, 2688(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm23, 2624(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm12, 2944(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, 2880(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups (%rsp), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 2816(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm20, 2752(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm28, 2688(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm22, 2624(%rax)
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, 2560(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm31, 2496(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, 2432(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm23, 2496(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, 2432(%rax)
 ; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
 ; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm6, 2368(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm26, 2304(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm24, 2240(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm27, 2176(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, 2112(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm10, 2048(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, 1984(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm4, 1920(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm24, 2304(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm21, 2240(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm26, 2176(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, 2112(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm29, 2048(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, 1984(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm5, 1920(%rax)
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm30, 1856(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm29, 1792(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 1728(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, 1664(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm27, 1792(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm31, 1728(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, 1664(%rax)
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm25, 1600(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, 1536(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm3, 1472(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, 1536(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm4, 1472(%rax)
 ; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 1408(%rax)
 ; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 1344(%rax)
 ; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 1280(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm12, 1216(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm19, 1152(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm28, 1088(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm2, 1024(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovups (%rsp), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm11, 1216(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm15, 1152(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, 1088(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm3, 1024(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 960(%rax)
 ; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 896(%rax)
 ; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 832(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm14, 768(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm17, 704(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm11, 640(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm1, 576(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, 768(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm14, 704(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm18, 640(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm2, 576(%rax)
 ; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 512(%rax)
 ; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 448(%rax)
 ; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 384(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm15, 320(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm16, 256(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, 192(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm16, 320(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm19, 256(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm10, 192(%rax)
 ; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 128(%rax)
 ; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 64(%rax)
 ; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, (%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, 3520(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm17, 3520(%rax)
 ; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 3456(%rax)
 ; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
@@ -13572,820 +13499,816 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 3264(%rax)
 ; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 3200(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm18, 3072(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 3072(%rax)
 ; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 3136(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    addq $6600, %rsp # imm = 0x19C8
+; AVX512F-ONLY-SLOW-NEXT:    addq $6248, %rsp # imm = 0x1868
 ; AVX512F-ONLY-SLOW-NEXT:    vzeroupper
 ; AVX512F-ONLY-SLOW-NEXT:    retq
 ;
 ; AVX512F-ONLY-FAST-LABEL: store_i64_stride7_vf64:
 ; AVX512F-ONLY-FAST:       # %bb.0:
-; AVX512F-ONLY-FAST-NEXT:    subq $6696, %rsp # imm = 0x1A28
+; AVX512F-ONLY-FAST-NEXT:    subq $6120, %rsp # imm = 0x17E8
 ; AVX512F-ONLY-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%rdi), %zmm5
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%rdi), %zmm22
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%rsi), %zmm23
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%rsi), %zmm17
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%rdx), %zmm24
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%rdx), %zmm6
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%rcx), %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%rdi), %zmm7
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%rcx), %zmm9
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm10 = [11,3,11,3,11,3,11,3]
-; AVX512F-ONLY-FAST-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm12 = [2,10,0,3,2,10,0,3]
-; AVX512F-ONLY-FAST-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-FAST-NEXT:    movb $96, %r10b
-; AVX512F-ONLY-FAST-NEXT:    kmovw %r10d, %k1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%rdi), %zmm13
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%rsi), %zmm8
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%rsi), %zmm20
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm20, (%rsp) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%rdx), %zmm14
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%rdx), %zmm10
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%rcx), %zmm11
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%rcx), %zmm19
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm9 = [11,3,11,3,11,3,11,3]
+; AVX512F-ONLY-FAST-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [2,10,0,3,2,10,0,3]
+; AVX512F-ONLY-FAST-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    movb $96, %r10b
+; AVX512F-ONLY-FAST-NEXT:    kmovw %r10d, %k1
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%r8), %zmm1
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%r9), %zmm8
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%rax), %zmm3
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%rax), %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%r9), %zmm18
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%rax), %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%rax), %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm23 = [15,7,15,7,15,7,15,7]
+; AVX512F-ONLY-FAST-NEXT:    # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [9,1,9,1,9,1,9,1]
 ; AVX512F-ONLY-FAST-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm8, %zmm0, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm14
-; AVX512F-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm27 = [4,9,0,3,4,9,0,3]
-; AVX512F-ONLY-FAST-NEXT:    # zmm27 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm27, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm18, %zmm0, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm6
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [4,9,0,3,4,9,0,3]
+; AVX512F-ONLY-FAST-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm0, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm12
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm23, %zmm10, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm6, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm12, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm8, %zmm9, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm10, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm11, %zmm3, %zmm2
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k1}
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa (%r9), %ymm0
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa 64(%r9), %ymm11
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa (%r8), %ymm6
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%r8), %ymm26
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm6[0],ymm0[0],ymm6[2],ymm0[2]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 64(%r9), %ymm10
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa (%r8), %ymm8
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 64(%r8), %ymm7
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm8[0],ymm0[0],ymm8[2],ymm0[2]
 ; AVX512F-ONLY-FAST-NEXT:    movb $28, %r10b
 ; AVX512F-ONLY-FAST-NEXT:    kmovw %r10d, %k2
-; AVX512F-ONLY-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm0[2,3,2,3],zmm3[2,3,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm0[2,3,2,3],zmm4[2,3,2,3]
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [4,12,0,5,4,12,0,5]
 ; AVX512F-ONLY-FAST-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, %zmm6
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm8, %zmm2, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm7
-; AVX512F-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [0,1,12,7,0,1,12,7]
-; AVX512F-ONLY-FAST-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm2, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm18
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [5,0,14,6,5,0,14,6]
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm18, %zmm2, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [0,1,12,7,0,1,12,7]
 ; AVX512F-ONLY-FAST-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm6, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm8, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [0,13,6,7,0,13,6,7]
-; AVX512F-ONLY-FAST-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm0, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm20
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm15 = [15,7,15,7,15,7,15,7]
-; AVX512F-ONLY-FAST-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm6, %zmm15, %zmm1
-; AVX512F-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm16 = [6,13,14,7,6,13,14,7]
-; AVX512F-ONLY-FAST-NEXT:    # zmm16 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm16, %zmm3
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm22, %zmm1
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm17, %zmm10, %zmm1
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm24, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm9, %zmm12, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm26[0],ymm11[0],ymm26[2],ymm11[2]
-; AVX512F-ONLY-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm1[2,3,2,3],zmm4[2,3,2,3]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm8, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm22 = [5,0,14,6,5,0,14,6]
+; AVX512F-ONLY-FAST-NEXT:    # zmm22 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm18, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm22, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm15 = [0,13,6,7,0,13,6,7]
+; AVX512F-ONLY-FAST-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm15, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm18, %zmm23, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm18 = [6,13,14,7,6,13,14,7]
+; AVX512F-ONLY-FAST-NEXT:    # zmm18 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm18, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm20, %zmm9, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm14, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm19, %zmm3, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm7[0],ymm10[0],ymm7[2],ymm10[2]
+; AVX512F-ONLY-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm1[2,3,2,3],zmm5[2,3,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%r8), %zmm1
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%r9), %zmm3
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm14, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm27, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm7, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm18, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm8, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm0, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%r9), %zmm30
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm30, %zmm6, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm12, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm30, %zmm2, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm8, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm30, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm22, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm15, %zmm2
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm15, %zmm1
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm16, %zmm4
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm30, %zmm23, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm18, %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 128(%rdi), %zmm1
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 128(%rsi), %zmm19
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm19, %zmm10, %zmm1
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 128(%rdx), %zmm3
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 128(%rcx), %zmm24
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm24, %zmm12, %zmm3
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm3 {%k1}
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 128(%rax), %zmm6
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 128(%rsi), %zmm26
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm26, %zmm9, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, %zmm29
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 128(%rdx), %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 128(%rcx), %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm3, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm9
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 128(%rax), %zmm7
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa 128(%r9), %ymm1
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa 128(%r8), %ymm4
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm2 = ymm4[0],ymm1[0],ymm4[2],ymm1[2]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %ymm4, %ymm25
-; AVX512F-ONLY-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm2[2,3,2,3],zmm6[2,3,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 128(%r8), %ymm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; AVX512F-ONLY-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm1[2,3,2,3],zmm7[2,3,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 128(%r8), %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 128(%r9), %zmm19
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm19, %zmm6, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm12, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm12, %zmm3
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 128(%r8), %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 128(%r9), %zmm1
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm3
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm14, %zmm3
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm6, %zmm27, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm19, %zmm5, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, %zmm13
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm8, %zmm3
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm3
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, %zmm4
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm7, %zmm3
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm6, %zmm18, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm19, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm22, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm15, %zmm20
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm15, %zmm3
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm3
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm2, %zmm8, %zmm3
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm6, %zmm0, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm19, %zmm23, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm18, %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 192(%rdi), %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 192(%rsi), %zmm28
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm28, %zmm29, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 192(%rdx), %zmm3
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm15, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm2, %zmm16, %zmm6
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 192(%rdi), %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 192(%rsi), %zmm11
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm11, %zmm10, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm10, %zmm28
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 192(%rdx), %zmm10
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 192(%rcx), %zmm1
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm12, %zmm10
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm12, %zmm5
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm10 {%k1}
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 192(%rax), %zmm30
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 192(%r9), %ymm17
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa 192(%r8), %ymm12
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm12[0],ymm17[0],ymm12[2],ymm17[2]
-; AVX512F-ONLY-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm10 {%k2} = zmm3[2,3,2,3],zmm30[2,3,2,3]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 192(%r8), %zmm3
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 192(%r9), %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm10
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm14, %zmm10
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm14, %zmm7
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm30, %zmm27, %zmm10
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm13
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm4, %zmm13
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm18, %zmm6
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm30, %zmm18, %zmm13
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 192(%rcx), %zmm0
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm4
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm8, %zmm4
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm30, %zmm20, %zmm4
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm9, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm3 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 192(%rax), %zmm14
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 192(%r9), %ymm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 192(%r8), %ymm5
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm4 = ymm5[0],ymm0[0],ymm5[2],ymm0[2]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %ymm5, %ymm24
+; AVX512F-ONLY-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm4[2,3,2,3],zmm14[2,3,2,3]
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm15, %zmm3
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm16, %zmm30
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 256(%rdi), %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 192(%r8), %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 192(%r9), %zmm17
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm17, %zmm6, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm6, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm14, %zmm12, %zmm3
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 256(%rsi), %zmm13
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm13, %zmm28, %zmm3
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm28, %zmm29
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 256(%rdx), %zmm28
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 256(%rcx), %zmm18
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm28, %zmm14
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm18, %zmm5, %zmm14
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, %zmm30
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm14 {%k1}
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 256(%rax), %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa 256(%r9), %ymm9
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 256(%r8), %ymm22
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm4 = ymm22[0],ymm9[0],ymm22[2],ymm9[2]
-; AVX512F-ONLY-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm4[2,3,2,3],zmm0[2,3,2,3]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 256(%r8), %zmm4
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 256(%r9), %zmm3
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm14
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm7, %zmm14
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm5
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm27, %zmm14
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm14
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm3
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm2, %zmm14
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm6, %zmm14
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm17, %zmm13, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm14, %zmm8, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm17, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm22, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm14, %zmm15, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, %zmm21
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm17, %zmm23, %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm18, %zmm14
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 256(%rdi), %zmm5
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, %zmm10
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm8, %zmm10
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm20, %zmm10
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm20, %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 256(%rsi), %zmm15
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm29, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm15, %zmm29, %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 256(%rdx), %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 256(%rcx), %zmm29
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, %zmm8
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm29, %zmm9, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, %zmm3 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 256(%rax), %zmm16
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 256(%r9), %ymm10
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 256(%r8), %ymm5
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm6 = ymm5[0],ymm10[0],ymm5[2],ymm10[2]
+; AVX512F-ONLY-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm6[2,3,2,3],zmm16[2,3,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 256(%r8), %zmm6
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 256(%r9), %zmm9
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm6, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm9, %zmm4, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm16, %zmm12, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm16, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm6, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm16
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm9, %zmm13, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm7, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm6, %zmm22, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm20, %zmm3
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm15, %zmm4
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm16, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm6, %zmm11
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm9, %zmm23, %zmm6
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm6, %zmm18, %zmm0
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 320(%rdi), %zmm4
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 320(%rsi), %zmm20
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm20, %zmm29, %zmm4
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 320(%rdx), %zmm31
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 320(%rcx), %zmm14
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm31, %zmm21
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm14, %zmm30, %zmm21
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm21 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 320(%rdi), %zmm27
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 320(%rsi), %zmm25
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm27, %zmm6
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm25, %zmm1, %zmm6
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 320(%rdx), %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 320(%rcx), %zmm12
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm12, %zmm8, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, %zmm13
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm6, %zmm0 {%k1}
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 320(%rax), %zmm1
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa 320(%r9), %ymm4
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa 320(%r8), %ymm2
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm10 = ymm2[0],ymm4[0],ymm2[2],ymm4[2]
-; AVX512F-ONLY-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm21 {%k2} = zmm10[2,3,2,3],zmm1[2,3,2,3]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 320(%r8), %zmm10
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 320(%r9), %zmm21
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm10, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm21, %zmm7, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm27, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 320(%r9), %ymm6
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 320(%r8), %ymm14
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm8 = ymm14[0],ymm6[0],ymm14[2],ymm6[2]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %ymm14, %ymm31
+; AVX512F-ONLY-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k2} = zmm8[2,3,2,3],zmm1[2,3,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 320(%r8), %zmm8
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 320(%r9), %zmm14
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm14, %zmm4, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm2
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm10, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm21, %zmm3, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm6, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm14, %zmm16, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm7, %zmm0
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm21, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm10, %zmm8, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm5, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm14, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm8, %zmm22, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm20, %zmm0
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm21, %zmm15, %zmm10
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm10, %zmm16, %zmm1
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm14, %zmm23, %zmm8
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm8, %zmm18, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} ymm0 = [1,3,7,7]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm26 # 32-byte Folded Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %ymm26, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm25 # 32-byte Folded Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %ymm25, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %ymm17, %ymm0, %ymm12
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %ymm9, %ymm0, %ymm22
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %ymm22, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %ymm4, %ymm0, %ymm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa 384(%r9), %ymm1
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa 384(%r8), %ymm2
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[2],ymm1[2]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %ymm1, %ymm0, %ymm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm27 = [3,0,12,4,3,0,12,4]
-; AVX512F-ONLY-FAST-NEXT:    # zmm27 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm24 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %ymm24, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %ymm10, %ymm0, %ymm5
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %ymm6, %ymm0, %ymm31
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %ymm31, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 384(%r9), %ymm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 384(%r8), %ymm4
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm5 = ymm4[0],ymm2[0],ymm4[2],ymm2[2]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %ymm2, %ymm0, %ymm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 384(%rdx), %zmm31
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 384(%rcx), %zmm10
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm31, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm10, %zmm23, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 384(%rdi), %zmm24
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 384(%rsi), %zmm18
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm18, %zmm23, %zmm24
+; AVX512F-ONLY-FAST-NEXT:    movb $24, %r10b
+; AVX512F-ONLY-FAST-NEXT:    kmovw %r10d, %k3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm24 {%k3}
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm20 = [0,8,0,8,0,8,0,8]
+; AVX512F-ONLY-FAST-NEXT:    # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm27, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm12 = [13,5,13,5,13,5,13,5]
-; AVX512F-ONLY-FAST-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm3
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm12, %zmm3
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm30 = [6,14,6,14,6,14,6,14]
-; AVX512F-ONLY-FAST-NEXT:    # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm3
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm30, %zmm3
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm15, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm0 # 64-byte Folded Reload
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    movb $48, %r10b
-; AVX512F-ONLY-FAST-NEXT:    kmovw %r10d, %k3
-; AVX512F-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [0,8,0,1,0,8,0,1]
-; AVX512F-ONLY-FAST-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm23, %zmm10, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm30, %zmm20, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm19, %zmm20, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm17, %zmm20, %zmm21
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm9, %zmm20, %zmm11
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm14, %zmm20, %zmm1
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [1,0,10,2,1,0,10,2]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 384(%r8), %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm1 = <u,1,2,3,4,15,u,u>
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm1, %zmm24
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 384(%r9), %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm2 = <u,1,2,3,4,5,15,u>
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm2, %zmm24
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm1, %zmm0, %zmm20
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm1, %zmm0, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm1, %zmm0, %zmm16
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm22, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm9
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 448(%rdx), %zmm30
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 448(%rcx), %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [6,14,6,14,6,14,6,14]
+; AVX512F-ONLY-FAST-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm30, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm1, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm30, %zmm8
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm13, %zmm8
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [3,0,12,4,3,0,12,4]
 ; AVX512F-ONLY-FAST-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm23, %zmm9
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm7, %zmm9
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm2 {%k3} = zmm0[0],zmm23[0],zmm0[2],zmm23[2],zmm0[4],zmm23[4],zmm0[6],zmm23[6]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm11
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm30, %zmm7, %zmm11
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm5 = [13,5,13,5,13,5,13,5]
+; AVX512F-ONLY-FAST-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm30, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm5, %zmm2
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm23, %zmm12, %zmm1
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} ymm29 = [6,14,6,14]
-; AVX512F-ONLY-FAST-NEXT:    # ymm29 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm23, %zmm29, %zmm1
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} ymm25 = [15,7,15,7]
-; AVX512F-ONLY-FAST-NEXT:    # ymm25 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm23, %zmm25, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm23, %zmm30
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm17, %zmm0
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm27, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm7, %zmm0
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm12, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm17, %zmm5, %zmm2
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm30, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm17, %zmm1, %zmm2
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm15, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm17, %zmm23, %zmm3
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    movb $48, %r10b
+; AVX512F-ONLY-FAST-NEXT:    kmovw %r10d, %k4
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm4 = [0,8,0,1,0,8,0,1]
+; AVX512F-ONLY-FAST-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3]
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm10, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm7, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm19, %zmm4, %zmm2
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm1 {%k3} = zmm3[0],zmm0[0],zmm3[2],zmm0[2],zmm3[4],zmm0[4],zmm3[6],zmm0[6]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm1
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm12, %zmm1
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm1
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm29, %zmm1
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm25, %zmm3
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm24, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm2, %zmm27, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm1
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm24, %zmm12, %zmm1
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm1
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm24, %zmm30, %zmm1
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm24, %zmm15, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm6 = [1,0,10,2,1,0,10,2]
+; AVX512F-ONLY-FAST-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm19, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm6, %zmm2
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm1
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm19, %zmm10, %zmm1
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm19, %zmm1
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm2, %zmm7, %zmm1
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm0 {%k3} = zmm2[0],zmm19[0],zmm2[2],zmm19[2],zmm2[4],zmm19[4],zmm2[6],zmm19[6]
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm0 {%k4} = zmm3[0],zmm19[0],zmm3[2],zmm19[2],zmm3[4],zmm19[4],zmm3[6],zmm19[6]
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm19, %zmm12, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm19, %zmm5, %zmm0
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm19, %zmm29, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm19, %zmm1, %zmm0
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm19, %zmm25, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm27, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm1
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm2, %zmm12, %zmm1
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm1
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm2, %zmm30, %zmm1
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm2, %zmm15, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm19, %zmm23, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm13, %zmm7, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm5, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm1, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm23, %zmm13
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm4, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm13, %zmm6, %zmm3
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm2 {%k4} = zmm13[0],zmm0[0],zmm13[2],zmm0[2],zmm13[4],zmm0[4],zmm13[6],zmm0[6]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm5, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm1, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm23, %zmm13
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm1
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm11, %zmm10, %zmm1
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm11, %zmm1
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm2, %zmm7, %zmm1
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm0 {%k3} = zmm2[0],zmm11[0],zmm2[2],zmm11[2],zmm2[4],zmm11[4],zmm2[6],zmm11[6]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm7, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm14
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm2, %zmm5, %zmm14
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm14
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm2, %zmm1, %zmm14
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm2, %zmm23, %zmm0
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm11, %zmm12, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm26, %zmm4, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm26, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm6, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, (%rsp) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm3 {%k4} = zmm0[0],zmm26[0],zmm0[2],zmm26[2],zmm0[4],zmm26[4],zmm0[6],zmm26[6]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm26, %zmm5, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm26, %zmm1, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm26, %zmm23, %zmm0
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm11, %zmm29, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm13, %zmm7, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm5, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm1, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm23, %zmm13
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm28, %zmm4, %zmm0
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm11, %zmm25, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm28, %zmm22
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm6, %zmm22
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm2 {%k4} = zmm3[0],zmm28[0],zmm3[2],zmm28[2],zmm3[4],zmm28[4],zmm3[6],zmm28[6]
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm18, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm28, %zmm27, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm28, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm18, %zmm12, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm28, %zmm5, %zmm0
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm28, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm18, %zmm30, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm28, %zmm1, %zmm0
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm18, %zmm15, %zmm28
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm28, %zmm23, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm29, %zmm28
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm7, %zmm28
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm13, %zmm10, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm29, %zmm5, %zmm2
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm7, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm29, %zmm1, %zmm2
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm1 {%k3} = zmm0[0],zmm13[0],zmm0[2],zmm13[2],zmm0[4],zmm13[4],zmm0[6],zmm13[6]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm13, %zmm12, %zmm1
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm13, %zmm29, %zmm1
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm13, %zmm25, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm29, %zmm23, %zmm0
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm14, %zmm1
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm31, %zmm27, %zmm1
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm31, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm14, %zmm12, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm31, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm14, %zmm30, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm14, %zmm15, %zmm31
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm29
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm15, %zmm4, %zmm29
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm15, %zmm21
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm6, %zmm21
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm28 {%k4} = zmm0[0],zmm15[0],zmm0[2],zmm15[2],zmm0[4],zmm15[4],zmm0[6],zmm15[6]
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm20, %zmm10, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm15, %zmm5, %zmm2
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm20, %zmm28
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm7, %zmm28
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm1 {%k3} = zmm0[0],zmm20[0],zmm0[2],zmm20[2],zmm0[4],zmm20[4],zmm0[6],zmm20[6]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm20, %zmm12, %zmm1
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm20, %zmm29, %zmm1
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm20, %zmm25, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm22
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 448(%rdx), %zmm13
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 448(%rcx), %zmm5
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm14
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm0, %zmm14
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm16
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm30, %zmm16
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, %zmm20
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm13, %zmm27, %zmm20
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm19
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm12, %zmm19
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm15, %zmm13
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 384(%rdx), %zmm4
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 384(%rcx), %zmm5
-; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm5, %zmm4, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm15, %zmm1, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm15, %zmm23, %zmm0
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm4, %zmm5, %zmm27
-; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm5, %zmm4, %zmm30
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm15, %zmm4
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm12, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm12, %zmm19
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm16, %zmm7, %zmm19
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm16, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm12, %zmm5, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm16, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm12, %zmm1, %zmm0
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 448(%rdi), %zmm11
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm12, %zmm23, %zmm16
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 448(%rdi), %zmm12
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 448(%rsi), %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm11, %zmm8
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm1, %zmm8
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm11, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm29, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm11, %zmm3
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm12, %zmm3
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 384(%rdi), %zmm6
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 384(%rsi), %zmm5
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm6, %zmm24
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm10, %zmm24
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, %zmm23
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm6, %zmm7, %zmm23
-; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm5, %zmm6, %zmm1
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm27 {%k3} = zmm6[0],zmm5[0],zmm6[2],zmm5[2],zmm6[4],zmm5[4],zmm6[6],zmm5[6]
-; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm5, %zmm6, %zmm12
-; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm5, %zmm6, %zmm29
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm25, %zmm6
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm20 {%k3} = zmm11[0],zmm0[0],zmm11[2],zmm0[2],zmm11[4],zmm0[4],zmm11[6],zmm0[6]
-; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm0, %zmm11, %zmm10
-; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm11, %zmm0, %zmm7
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm25, %zmm11
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, %zmm14 {%k1}
-; AVX512F-ONLY-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,2,3],zmm16[4,5,6,7]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 448(%r8), %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm5 = <0,11,u,u,4,5,6,7>
-; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm2, %zmm20, %zmm5
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 448(%r9), %zmm25
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm15 = <0,1,11,u,4,5,6,7>
-; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm25, %zmm5, %zmm15
-; AVX512F-ONLY-FAST-NEXT:    movb $4, %sil
-; AVX512F-ONLY-FAST-NEXT:    kmovw %esi, %k3
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm14 {%k3}
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm16 = <0,1,2,10,u,5,6,7>
-; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm25, %zmm14, %zmm16
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm19, %zmm3 {%k1}
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm5 = <12,u,u,3,4,5,6,13>
-; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm2, %zmm3, %zmm5
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm19 = <0,12,u,3,4,5,6,7>
-; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm25, %zmm5, %zmm19
-; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm20 = [0,8,0,8,0,8,0,8]
-; AVX512F-ONLY-FAST-NEXT:    # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm1 # 64-byte Folded Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm1 # 64-byte Folded Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm1 # 64-byte Folded Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm1 # 64-byte Folded Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm17 # 64-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm12, %zmm15
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm23, %zmm15
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm25, %zmm17
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm27, %zmm13
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm27, %zmm6, %zmm17
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm19 {%k4} = zmm27[0],zmm25[0],zmm27[2],zmm25[2],zmm27[4],zmm25[4],zmm27[6],zmm25[6]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm27, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm27, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm25, %zmm23, %zmm13
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm25, %zmm4, %zmm27
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm25, %zmm5, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm25, %zmm1, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm21, %zmm20, %zmm26
-; AVX512F-ONLY-FAST-NEXT:    movb $24, %sil
-; AVX512F-ONLY-FAST-NEXT:    kmovw %esi, %k4
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm6 {%k4}
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm10, %zmm31, %zmm26
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm31, %zmm10, %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm31, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm10, %zmm5, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm10, %zmm1, %zmm31
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm12, %zmm10
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm1, %zmm10
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm12, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm23, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm12, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm5, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm18, %zmm25
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm14, %zmm6, %zmm25
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm18, %zmm14, %zmm23
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm7 {%k4} = zmm14[0],zmm18[0],zmm14[2],zmm18[2],zmm14[4],zmm18[4],zmm14[6],zmm18[6]
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm18, %zmm14, %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm14, %zmm23
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm18, %zmm1, %zmm14
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm18, %zmm4, %zmm23
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm11 {%k4} = zmm12[0],zmm0[0],zmm12[2],zmm0[2],zmm12[4],zmm0[4],zmm12[6],zmm0[6]
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm0, %zmm12, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm12, %zmm6, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm1 # 64-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    # zmm1 = zmm10[0,1,2,3],mem[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm8 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 448(%r8), %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm10 = <0,11,u,u,4,5,6,7>
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm3, %zmm11, %zmm10
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm6, %zmm2 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm11 = <12,u,u,3,4,5,6,13>
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm3, %zmm2, %zmm11
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm30, %zmm15 {%k3}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm2 = <u,1,2,3,4,15,u,u>
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm2, %zmm15
 ; AVX512F-ONLY-FAST-NEXT:    movb $6, %sil
 ; AVX512F-ONLY-FAST-NEXT:    kmovw %esi, %k5
-; AVX512F-ONLY-FAST-NEXT:    vpbroadcastq 456(%rcx), %ymm1
-; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm7 {%k5}
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm1 = <0,1,2,9,u,u,6,7>
-; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm2, %zmm7, %zmm1
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm11 {%k4}
+; AVX512F-ONLY-FAST-NEXT:    vpbroadcastq 456(%rcx), %ymm2
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = mem[0,1,2,3],ymm2[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm0, %zmm0 {%k5}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm2 = <0,1,2,9,u,u,6,7>
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm3, %zmm0, %zmm2
 ; AVX512F-ONLY-FAST-NEXT:    movb $64, %sil
-; AVX512F-ONLY-FAST-NEXT:    kmovw %esi, %k3
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm0 {%k3}
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm3 = <u,1,2,3,4,15,u,u>
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm2, %zmm3, %zmm11
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 384(%r8), %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm2, %zmm3, %zmm6
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 384(%r9), %zmm3
-; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm3, %zmm2, %zmm20
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm3, %zmm2, %zmm4
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm3, %zmm2, %zmm14
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm2, %zmm3, %zmm13
-; AVX512F-ONLY-FAST-NEXT:    movb $12, %sil
-; AVX512F-ONLY-FAST-NEXT:    kmovw %esi, %k3
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa 448(%rdx), %xmm2
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0]
-; AVX512F-ONLY-FAST-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm0, %zmm10 {%k3}
-; AVX512F-ONLY-FAST-NEXT:    vinserti32x4 $2, 448(%r8), %zmm10, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm5 = <0,1,2,3,4,8,u,7>
-; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm25, %zmm2, %zmm5
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm2 = <u,1,2,3,4,5,15,u>
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm2, %zmm6
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm7 = <0,1,2,3,9,u,6,7>
-; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm25, %zmm1, %zmm7
+; AVX512F-ONLY-FAST-NEXT:    kmovw %esi, %k4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm1 {%k4}
+; AVX512F-ONLY-FAST-NEXT:    movb $4, %sil
+; AVX512F-ONLY-FAST-NEXT:    kmovw %esi, %k4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm8 {%k4}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 448(%r9), %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm31 = <0,1,11,u,4,5,6,7>
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm0, %zmm10, %zmm31
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm3 = <0,1,2,10,u,5,6,7>
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm0, %zmm8, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm8 = <0,12,u,3,4,5,6,7>
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm0, %zmm11, %zmm8
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm10 = <13,u,2,3,4,5,6,14>
-; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm25, %zmm0, %zmm10
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm25, %zmm2, %zmm11
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm0, %zmm1, %zmm10
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm1 = <u,1,2,3,4,5,15,u>
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm1, %zmm15
+; AVX512F-ONLY-FAST-NEXT:    movb $12, %sil
+; AVX512F-ONLY-FAST-NEXT:    kmovw %esi, %k4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 448(%rdx), %xmm1
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0]
+; AVX512F-ONLY-FAST-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm4 {%k4}
+; AVX512F-ONLY-FAST-NEXT:    vinserti32x4 $2, 448(%r8), %zmm4, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm4 = <0,1,2,3,4,8,u,7>
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm0, %zmm1, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm11 = <0,1,2,3,9,u,6,7>
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm0, %zmm2, %zmm11
 ; AVX512F-ONLY-FAST-NEXT:    vpbroadcastq 8(%rcx), %ymm0
 ; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm9 {%k5}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm14 {%k5}
 ; AVX512F-ONLY-FAST-NEXT:    vpbroadcastq 72(%rcx), %ymm0
 ; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm21 {%k5}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm6 {%k5}
 ; AVX512F-ONLY-FAST-NEXT:    vpbroadcastq 136(%rcx), %ymm0
 ; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm8 {%k5}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 (%rsp), %zmm2 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k5}
 ; AVX512F-ONLY-FAST-NEXT:    vpbroadcastq 200(%rcx), %ymm0
 ; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm18 {%k5}
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm22 {%k5}
 ; AVX512F-ONLY-FAST-NEXT:    vpbroadcastq 264(%rcx), %ymm0
 ; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm25 {%k5}
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm21 {%k5}
 ; AVX512F-ONLY-FAST-NEXT:    vpbroadcastq 328(%rcx), %ymm0
 ; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm28 {%k5}
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm17 {%k5}
 ; AVX512F-ONLY-FAST-NEXT:    vpbroadcastq 392(%rcx), %ymm0
 ; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm23 {%k5}
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 448(%rax), %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,10,5,6,7]
-; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm0, %zmm16, %zmm1
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,1,12,3,4,5,6,7]
-; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm0, %zmm19, %zmm1
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 384(%rax), %zmm1
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm3, %zmm4
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm16 {%k1}
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm16 {%k2} = zmm3[2,3,2,3],zmm1[2,3,2,3]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm3, %zmm14
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm3, %zmm13
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,2,3,4,5,8,7]
-; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm0, %zmm5, %zmm3
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm5 = [14,1,2,3,4,5,6,15]
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm5, %zmm6
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,4,9,6,7]
-; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm0, %zmm7, %zmm1
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,13,2,3,4,5,6,7]
-; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm0, %zmm10, %zmm1
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm5, %zmm11
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm25 {%k5}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 384(%rax), %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm1, %zmm30
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm26 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm26 {%k2} = zmm1[2,3,2,3],zmm0[2,3,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm1, %zmm26
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm1, %zmm9
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [14,1,2,3,4,5,6,15]
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm1, %zmm24
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 448(%rax), %zmm12
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,10,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm12, %zmm3, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [0,1,12,3,4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm12, %zmm8, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [0,13,2,3,4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm12, %zmm10, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm12, %zmm1, %zmm15
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,5,8,7]
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm12, %zmm4, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,9,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm12, %zmm11, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-FAST-NEXT:    movb $8, %sil
 ; AVX512F-ONLY-FAST-NEXT:    kmovw %esi, %k2
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm15 {%k2}
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm12, %zmm31 {%k2}
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k4}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k3}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm3 {%k4}
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm5 {%k4}
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm7 {%k4}
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm10 {%k4}
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm22 {%k4}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm1 {%k3}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k3}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm8 {%k3}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm9 {%k3}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm16, %zmm13 {%k3}
 ; AVX512F-ONLY-FAST-NEXT:    movb $-31, %sil
 ; AVX512F-ONLY-FAST-NEXT:    kmovw %esi, %k2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm0 {%k2}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k2}
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm3 {%k2}
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm5 {%k2}
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm4 {%k2}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm7 {%k2}
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm8 {%k2}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm10 {%k2}
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm9 {%k2}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm22 {%k2}
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa (%rdx), %xmm0
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0]
-; AVX512F-ONLY-FAST-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm13 {%k2}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa (%rdx), %xmm3
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0]
+; AVX512F-ONLY-FAST-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm1 {%k3}
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa 64(%rdx), %xmm0
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0]
-; AVX512F-ONLY-FAST-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm3 {%k3}
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa 128(%rdx), %xmm0
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0]
-; AVX512F-ONLY-FAST-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm5 {%k3}
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa 192(%rdx), %xmm0
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0]
-; AVX512F-ONLY-FAST-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm7 {%k3}
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa 256(%rdx), %xmm0
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0]
-; AVX512F-ONLY-FAST-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm10 {%k3}
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa 320(%rdx), %xmm0
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0]
-; AVX512F-ONLY-FAST-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm13 {%k3}
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa 384(%rdx), %xmm0
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0]
-; AVX512F-ONLY-FAST-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm24 {%k3}
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm0, %zmm1 {%k4}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 64(%rdx), %xmm3
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0]
+; AVX512F-ONLY-FAST-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm0, %zmm4 {%k4}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 128(%rdx), %xmm3
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0]
+; AVX512F-ONLY-FAST-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm0, %zmm8 {%k4}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 192(%rdx), %xmm3
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0]
+; AVX512F-ONLY-FAST-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm0, %zmm9 {%k4}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 256(%rdx), %xmm3
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0]
+; AVX512F-ONLY-FAST-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm0, %zmm29 {%k4}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 320(%rdx), %xmm3
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0]
+; AVX512F-ONLY-FAST-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm0, %zmm27 {%k4}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 384(%rdx), %xmm3
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0]
+; AVX512F-ONLY-FAST-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm0, %zmm23 {%k4}
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vinserti32x4 $3, (%rax), %zmm0, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vinserti32x4 $3, (%rax), %zmm0, %zmm3
 ; AVX512F-ONLY-FAST-NEXT:    movb $112, %cl
 ; AVX512F-ONLY-FAST-NEXT:    kmovw %ecx, %k2
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k2}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm1 {%k2}
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vinserti32x4 $3, 64(%rax), %zmm0, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm3 {%k2}
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vinserti32x4 $3, 128(%rax), %zmm0, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm5 {%k2}
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vinserti32x4 $3, 64(%rax), %zmm0, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k2}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vinserti32x4 $3, 192(%rax), %zmm0, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm7 {%k2}
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vinserti32x4 $3, 256(%rax), %zmm17, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm10 {%k2}
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vinserti32x4 $3, 320(%rax), %zmm26, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm13 {%k2}
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm26
-; AVX512F-ONLY-FAST-NEXT:    vinserti32x4 $3, 384(%rax), %zmm20, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm24 {%k2}
-; AVX512F-ONLY-FAST-NEXT:    movb $56, %cl
-; AVX512F-ONLY-FAST-NEXT:    kmovw %ecx, %k2
+; AVX512F-ONLY-FAST-NEXT:    vinserti32x4 $3, 128(%rax), %zmm0, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm8 {%k2}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm9 {%k2}
+; AVX512F-ONLY-FAST-NEXT:    vinserti32x4 $3, 192(%rax), %zmm0, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm9 {%k2}
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm21 {%k2}
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vinserti32x4 $3, 256(%rax), %zmm0, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm29 {%k2}
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm8 {%k2}
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vinserti32x4 $3, 320(%rax), %zmm0, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm27 {%k2}
+; AVX512F-ONLY-FAST-NEXT:    vinserti32x4 $3, 384(%rax), %zmm20, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm23 {%k2}
+; AVX512F-ONLY-FAST-NEXT:    movb $56, %cl
+; AVX512F-ONLY-FAST-NEXT:    kmovw %ecx, %k2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm14 {%k2}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm6 {%k2}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm18 {%k2}
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k2}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, (%rsp) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm22 {%k2}
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm25 {%k2}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm21 {%k2}
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm28 {%k2}
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm23 {%k2}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm17 {%k2}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm30, %zmm25 {%k2}
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],mem[6,7]
 ; AVX512F-ONLY-FAST-NEXT:    movb $14, %cl
 ; AVX512F-ONLY-FAST-NEXT:    kmovw %ecx, %k2
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm22 {%k2}
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm21 {%k2}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm0, %zmm20 {%k2}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm0, %zmm14 {%k2}
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm19 {%k2}
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],mem[6,7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm0, %zmm13 {%k2}
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm15 {%k2}
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],mem[6,7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm0, %zmm6 {%k2}
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm8 {%k2}
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],mem[6,7]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm0, %zmm28 {%k2}
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm3 {%k2}
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],mem[6,7]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm0, %zmm19 {%k2}
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm27 {%k2}
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],mem[6,7]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm0, %zmm7 {%k2}
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
@@ -14393,1089 +14316,1093 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k1}
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm5 {%k1}
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm7 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm4 {%k1}
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm9 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm8 {%k1}
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm10 {%k1}
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm12 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm11 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm5 {%k1}
 ; AVX512F-ONLY-FAST-NEXT:    movb $120, %al
 ; AVX512F-ONLY-FAST-NEXT:    kmovw %eax, %k1
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm18 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm16 {%k1}
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm16 # 64-byte Folded Reload
-; AVX512F-ONLY-FAST-NEXT:    # zmm16 = zmm0[0,1,2,3],mem[4,5,6,7]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm14 {%k1}
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, %zmm4 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm9 # 64-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    # zmm9 = zmm0[0,1,2,3],mem[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm30 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm3 {%k1}
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, %zmm2 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, %zmm2 {%k1}
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, %zmm1 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm10, %zmm1 {%k1}
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm10, %zmm0 {%k1}
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm12, %zmm20 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm11, %zmm0 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, %zmm26 {%k1}
 ; AVX512F-ONLY-FAST-NEXT:    movb $-61, %al
 ; AVX512F-ONLY-FAST-NEXT:    kmovw %eax, %k1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm9 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    # zmm4 = zmm4[0,1,2,3],mem[4,5,6,7]
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, %zmm16 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, %zmm4 {%k1}
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
 ; AVX512F-ONLY-FAST-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm5 # 64-byte Folded Reload
 ; AVX512F-ONLY-FAST-NEXT:    # zmm5 = zmm5[0,1,2,3],mem[4,5,6,7]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, %zmm5 {%k1}
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm7 # 64-byte Folded Reload
-; AVX512F-ONLY-FAST-NEXT:    # zmm7 = zmm7[0,1,2,3],mem[4,5,6,7]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, %zmm7 {%k1}
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm9 # 64-byte Folded Reload
-; AVX512F-ONLY-FAST-NEXT:    # zmm9 = zmm9[0,1,2,3],mem[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, %zmm5 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm8 # 64-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    # zmm8 = zmm8[0,1,2,3],mem[4,5,6,7]
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm10, %zmm9 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm10, %zmm8 {%k1}
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
 ; AVX512F-ONLY-FAST-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm10 # 64-byte Folded Reload
 ; AVX512F-ONLY-FAST-NEXT:    # zmm10 = zmm10[0,1,2,3],mem[4,5,6,7]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm10 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm11, %zmm10 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm11 # 64-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    # zmm11 = zmm11[0,1,2,3],mem[4,5,6,7]
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm13 # 64-byte Folded Reload
-; AVX512F-ONLY-FAST-NEXT:    # zmm13 = zmm12[0,1,2,3],mem[4,5,6,7]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm31, %zmm13 {%k1}
-; AVX512F-ONLY-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm17 = zmm29[0,1,2,3],zmm30[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm12, %zmm11 {%k1}
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm12, %zmm17 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm12 # 64-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    # zmm12 = zmm12[0,1,2,3],mem[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm18, %zmm12 {%k1}
 ; AVX512F-ONLY-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm17, 3008(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm20, 2944(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm27, 2880(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm12, 2816(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm23, 2752(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm24, 2688(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm12, 2624(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, 2560(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm12, 3008(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm26, 2944(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, 2880(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm7, 2816(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm25, 2752(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm23, 2688(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm7, 2624(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm11, 2560(%rax)
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, 2496(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, 2432(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm12, 2368(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm28, 2304(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm26, 2240(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm19, 2432(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm7, 2368(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm17, 2304(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm27, 2240(%rax)
 ; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 2176(%rax)
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm10, 2112(%rax)
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, 2048(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, 1984(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm10, 1920(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm25, 1856(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 1792(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm28, 1984(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm7, 1920(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm21, 1856(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm29, 1792(%rax)
 ; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 1728(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, 1664(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, 1664(%rax)
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, 1600(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm15, 1536(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm9, 1472(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 1408(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm6, 1536(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm7, 1472(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm22, 1408(%rax)
 ; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 1344(%rax)
 ; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 1280(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, 1216(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, 1152(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm19, 1088(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm7, 1024(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, 1216(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, 1152(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, 1088(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm5, 1024(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovups (%rsp), %zmm0 # 64-byte Reload
 ; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 960(%rax)
 ; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 896(%rax)
 ; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 832(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, 768(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm14, 704(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm21, 640(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm5, 576(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, 768(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm30, 704(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm14, 640(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm4, 576(%rax)
 ; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 512(%rax)
 ; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 448(%rax)
 ; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 384(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm16, 320(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm18, 256(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm22, 192(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 128(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, 320(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm16, 256(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm20, 192(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm3, 128(%rax)
 ; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 64(%rax)
 ; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, (%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm11, 3520(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm15, 3520(%rax)
 ; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 3456(%rax)
 ; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 3392(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 3328(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm31, 3328(%rax)
 ; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 3264(%rax)
 ; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 3200(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm6, 3072(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm24, 3072(%rax)
 ; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 3136(%rax)
-; AVX512F-ONLY-FAST-NEXT:    addq $6696, %rsp # imm = 0x1A28
+; AVX512F-ONLY-FAST-NEXT:    addq $6120, %rsp # imm = 0x17E8
 ; AVX512F-ONLY-FAST-NEXT:    vzeroupper
 ; AVX512F-ONLY-FAST-NEXT:    retq
 ;
 ; AVX512DQ-SLOW-LABEL: store_i64_stride7_vf64:
 ; AVX512DQ-SLOW:       # %bb.0:
-; AVX512DQ-SLOW-NEXT:    subq $6472, %rsp # imm = 0x1948
+; AVX512DQ-SLOW-NEXT:    subq $6280, %rsp # imm = 0x1888
 ; AVX512DQ-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm3
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 64(%rdi), %zmm21
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%rsi), %zmm4
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 64(%rsi), %zmm18
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm18, (%rsp) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 64(%rdx), %zmm13
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm7
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 64(%rdi), %zmm16
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%rsi), %zmm9
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 64(%rsi), %zmm13
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 64(%rdx), %zmm18
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm18, (%rsp) # 64-byte Spill
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%rdx), %zmm5
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%rcx), %zmm12
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 64(%rcx), %zmm20
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm17 = [11,3,11,3,11,3,11,3]
-; AVX512DQ-SLOW-NEXT:    # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm27 = [2,10,0,3,2,10,0,3]
-; AVX512DQ-SLOW-NEXT:    # zmm27 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%rcx), %zmm6
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 64(%rcx), %zmm15
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm10 = [11,3,11,3,11,3,11,3]
+; AVX512DQ-SLOW-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm31 = [2,10,0,3,2,10,0,3]
+; AVX512DQ-SLOW-NEXT:    # zmm31 = mem[0,1,2,3,0,1,2,3]
 ; AVX512DQ-SLOW-NEXT:    movb $96, %r10b
 ; AVX512DQ-SLOW-NEXT:    kmovw %r10d, %k1
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%r8), %zmm1
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%r9), %zmm9
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%rax), %zmm6
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 64(%rax), %zmm7
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%r9), %zmm14
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%rax), %zmm3
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 64(%rax), %zmm4
+; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm25 = [15,7,15,7,15,7,15,7]
+; AVX512DQ-SLOW-NEXT:    # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [9,1,9,1,9,1,9,1]
 ; AVX512DQ-SLOW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm2
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm9, %zmm0, %zmm2
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm11
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm14, %zmm0, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm8
 ; AVX512DQ-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [4,9,0,3,4,9,0,3]
 ; AVX512DQ-SLOW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm6, %zmm0, %zmm2
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm10
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm3, %zmm0, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm11
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm0
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm4, %zmm17, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm0
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm9, %zmm10, %zmm0
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm2
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm12, %zmm27, %zmm2
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm6, %zmm31, %zmm2
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k1}
-; AVX512DQ-SLOW-NEXT:    vmovdqa (%r9), %ymm5
+; AVX512DQ-SLOW-NEXT:    vmovdqa (%r9), %ymm7
+; AVX512DQ-SLOW-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa 64(%r9), %ymm5
 ; AVX512DQ-SLOW-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa 64(%r9), %ymm3
-; AVX512DQ-SLOW-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-SLOW-NEXT:    vmovdqa (%r8), %ymm0
 ; AVX512DQ-SLOW-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa 64(%r8), %ymm4
-; AVX512DQ-SLOW-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[2],ymm5[2]
+; AVX512DQ-SLOW-NEXT:    vmovdqa 64(%r8), %ymm6
+; AVX512DQ-SLOW-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[2],ymm7[2]
 ; AVX512DQ-SLOW-NEXT:    movb $28, %r10b
 ; AVX512DQ-SLOW-NEXT:    kmovw %r10d, %k2
-; AVX512DQ-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm0[2,3,2,3],zmm6[2,3,2,3]
+; AVX512DQ-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm0[2,3,2,3],zmm3[2,3,2,3]
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [4,12,0,5,4,12,0,5]
-; AVX512DQ-SLOW-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm28 = [4,12,0,5,4,12,0,5]
+; AVX512DQ-SLOW-NEXT:    # zmm28 = mem[0,1,2,3,0,1,2,3]
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm0
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm5
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm9, %zmm2, %zmm0
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm16
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm14, %zmm28, %zmm0
 ; AVX512DQ-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [0,1,12,7,0,1,12,7]
 ; AVX512DQ-SLOW-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm6, %zmm2, %zmm0
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm9
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm3, %zmm2, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm12
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm19 = [5,0,14,6,5,0,14,6]
-; AVX512DQ-SLOW-NEXT:    # zmm19 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm2
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm14
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm1, %zmm19, %zmm2
-; AVX512DQ-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm15 = [0,13,6,7,0,13,6,7]
-; AVX512DQ-SLOW-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm6, %zmm15, %zmm2
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm5 = [15,7,15,7,15,7,15,7]
-; AVX512DQ-SLOW-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm14, %zmm5, %zmm1
-; AVX512DQ-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [6,13,14,7,6,13,14,7]
-; AVX512DQ-SLOW-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm1, %zmm8, %zmm6
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm1
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm18, %zmm17, %zmm1
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm2
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm20, %zmm27, %zmm2
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm4[0],ymm3[0],ymm4[2],ymm3[2]
-; AVX512DQ-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm1[2,3,2,3],zmm7[2,3,2,3]
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 64(%r8), %zmm1
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 64(%r9), %zmm3
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm2
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm3, %zmm11, %zmm2
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm7, %zmm10, %zmm2
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm2
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm3, %zmm16, %zmm2
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm7, %zmm9, %zmm2
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm2
+; AVX512DQ-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm17 = [5,0,14,6,5,0,14,6]
+; AVX512DQ-SLOW-NEXT:    # zmm17 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm14, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm1, %zmm17, %zmm0
+; AVX512DQ-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm24 = [0,13,6,7,0,13,6,7]
+; AVX512DQ-SLOW-NEXT:    # zmm24 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm3, %zmm24, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm14, %zmm25, %zmm1
+; AVX512DQ-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm27 = [6,13,14,7,6,13,14,7]
+; AVX512DQ-SLOW-NEXT:    # zmm27 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm1, %zmm27, %zmm3
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm1, %zmm19, %zmm2
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm7, %zmm15, %zmm2
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm3, %zmm5, %zmm1
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm1, %zmm8, %zmm7
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 128(%rdi), %zmm22
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 128(%rsi), %zmm23
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm22, %zmm1
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm23, %zmm17, %zmm1
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm29
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 128(%rdx), %zmm18
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 128(%rcx), %zmm14
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm1
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm13, %zmm10, %zmm1
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm2
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm14, %zmm27, %zmm2
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm15, %zmm31, %zmm2
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 128(%rax), %zmm7
-; AVX512DQ-SLOW-NEXT:    vmovdqa 128(%r9), %ymm6
-; AVX512DQ-SLOW-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa 128(%r8), %ymm1
-; AVX512DQ-SLOW-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm6[0],ymm1[2],ymm6[2]
-; AVX512DQ-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm1[2,3,2,3],zmm7[2,3,2,3]
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 128(%r8), %zmm1
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 128(%r9), %zmm6
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm2
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm6, %zmm11, %zmm2
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm31
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm7, %zmm10, %zmm2
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm30
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm2
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm6, %zmm16, %zmm2
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm7, %zmm9, %zmm2
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm2
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm1, %zmm19, %zmm2
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm7, %zmm15, %zmm2
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm6[0],ymm5[0],ymm6[2],ymm5[2]
+; AVX512DQ-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm1[2,3,2,3],zmm4[2,3,2,3]
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 64(%r8), %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 64(%r9), %zmm20
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm3
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm6
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm20, %zmm8, %zmm3
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm4, %zmm11, %zmm3
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm3
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm20, %zmm28, %zmm3
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm4, %zmm12, %zmm3
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm3
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm1, %zmm17, %zmm3
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm4, %zmm24, %zmm3
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm30
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm20, %zmm25, %zmm1
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm1, %zmm27, %zmm4
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 128(%rdi), %zmm1
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm6, %zmm5, %zmm1
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm1, %zmm8, %zmm7
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 192(%rdi), %zmm28
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 192(%rsi), %zmm21
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm1
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm21, %zmm17, %zmm1
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 192(%rdx), %zmm13
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 192(%rcx), %zmm26
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm2
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm26, %zmm27, %zmm2
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 192(%rax), %zmm6
-; AVX512DQ-SLOW-NEXT:    vmovdqa 192(%r9), %ymm4
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 128(%rsi), %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm0, %zmm10, %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 128(%rdx), %zmm23
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 128(%rcx), %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm3
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm0, %zmm31, %zmm3
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm3 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 128(%rax), %zmm5
+; AVX512DQ-SLOW-NEXT:    vmovdqa 128(%r9), %ymm4
 ; AVX512DQ-SLOW-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa 192(%r8), %ymm1
+; AVX512DQ-SLOW-NEXT:    vmovdqa 128(%r8), %ymm1
 ; AVX512DQ-SLOW-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[2],ymm4[2]
-; AVX512DQ-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm1[2,3,2,3],zmm6[2,3,2,3]
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 192(%r8), %zmm1
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 192(%r9), %zmm3
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm2
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm3, %zmm11, %zmm2
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm6, %zmm10, %zmm2
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm2
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm3, %zmm16, %zmm2
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm6, %zmm9, %zmm2
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm0
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm2
+; AVX512DQ-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm1[2,3,2,3],zmm5[2,3,2,3]
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm1, %zmm19, %zmm2
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm6, %zmm15, %zmm2
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm3, %zmm5, %zmm1
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm1, %zmm8, %zmm6
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 256(%rdi), %zmm25
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 256(%rsi), %zmm17
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm25, %zmm2
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm29, %zmm9
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm17, %zmm29, %zmm2
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 256(%rdx), %zmm11
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 256(%rcx), %zmm29
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm4
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm29, %zmm27, %zmm4
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm4 {%k1}
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 256(%rax), %zmm1
-; AVX512DQ-SLOW-NEXT:    vmovdqa 256(%r9), %ymm7
-; AVX512DQ-SLOW-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa 256(%r8), %ymm2
-; AVX512DQ-SLOW-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm2 = ymm2[0],ymm7[0],ymm2[2],ymm7[2]
-; AVX512DQ-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm2[2,3,2,3],zmm1[2,3,2,3]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 128(%r8), %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 128(%r9), %zmm13
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm4
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm13, %zmm8, %zmm4
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm5, %zmm11, %zmm4
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 256(%r8), %zmm2
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 256(%r9), %zmm6
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm4
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm6, %zmm31, %zmm4
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm1, %zmm10, %zmm4
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm4
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm13, %zmm28, %zmm4
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm18
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm5, %zmm12, %zmm4
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm8
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm4
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm6, %zmm16, %zmm4
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm1, %zmm0, %zmm4
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm3
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm5
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm1, %zmm17, %zmm5
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm8, %zmm24, %zmm5
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm22
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm13, %zmm25, %zmm1
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm1, %zmm27, %zmm8
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 192(%rdi), %zmm26
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 192(%rsi), %zmm0
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm4
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm2, %zmm19, %zmm4
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm1, %zmm15, %zmm4
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm26, %zmm1
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm0, %zmm10, %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm9
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 192(%rdx), %zmm21
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 192(%rcx), %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm5
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm0, %zmm31, %zmm5
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm5 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 192(%rax), %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqa 192(%r9), %ymm8
+; AVX512DQ-SLOW-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa 192(%r8), %ymm1
+; AVX512DQ-SLOW-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm8[0],ymm1[2],ymm8[2]
+; AVX512DQ-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm1[2,3,2,3],zmm2[2,3,2,3]
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 192(%r8), %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 192(%r9), %zmm8
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm16
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm8, %zmm6, %zmm16
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm2, %zmm11, %zmm16
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm15
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm8, %zmm28, %zmm15
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm2, %zmm12, %zmm15
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm6, %zmm5, %zmm2
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm2, %zmm8, %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm12
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm1, %zmm17, %zmm12
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm0
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm2, %zmm24, %zmm12
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm10
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm8, %zmm25, %zmm1
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm1, %zmm27, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 256(%rdi), %zmm15
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 256(%rsi), %zmm19
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm15, %zmm4
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm19, %zmm9, %zmm4
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm29
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 256(%rdx), %zmm14
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 256(%rcx), %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm14, %zmm1
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm0, %zmm31, %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm1 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 256(%rax), %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqa 256(%r9), %ymm12
+; AVX512DQ-SLOW-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa 256(%r8), %ymm4
+; AVX512DQ-SLOW-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm4 = ymm4[0],ymm12[0],ymm4[2],ymm12[2]
+; AVX512DQ-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k2} = zmm4[2,3,2,3],zmm0[2,3,2,3]
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 320(%rdi), %zmm7
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 320(%rsi), %zmm10
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm4
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm10, %zmm9, %zmm4
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 320(%rdx), %zmm6
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 320(%rcx), %zmm24
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm9
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm24, %zmm27, %zmm9
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm9 {%k1}
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 320(%rax), %zmm2
-; AVX512DQ-SLOW-NEXT:    vmovdqa 320(%r9), %ymm1
-; AVX512DQ-SLOW-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 256(%r8), %zmm4
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 256(%r9), %zmm12
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm1
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm12, %zmm6, %zmm1
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm0, %zmm11, %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm1
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm12, %zmm28, %zmm1
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm0, %zmm18, %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm1
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm4, %zmm17, %zmm1
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm0, %zmm24, %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm9
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm12, %zmm25, %zmm4
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm4, %zmm27, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 320(%rdi), %zmm16
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 320(%rsi), %zmm3
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm7
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm3, %zmm29, %zmm7
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 320(%rdx), %zmm4
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 320(%rcx), %zmm29
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm5
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm29, %zmm31, %zmm5
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm5 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 320(%rax), %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqa 320(%r9), %ymm7
+; AVX512DQ-SLOW-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-SLOW-NEXT:    vmovdqa 320(%r8), %ymm0
 ; AVX512DQ-SLOW-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
-; AVX512DQ-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm9 {%k2} = zmm4[2,3,2,3],zmm2[2,3,2,3]
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 320(%r8), %zmm4
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 320(%r9), %zmm1
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm0
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm1, %zmm31, %zmm0
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm2, %zmm30, %zmm0
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm0
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm1, %zmm16, %zmm0
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm2, %zmm3, %zmm0
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm7 = ymm0[0],ymm7[0],ymm0[2],ymm7[2]
+; AVX512DQ-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm7[2,3,2,3],zmm1[2,3,2,3]
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 320(%r8), %zmm7
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 320(%r9), %zmm11
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm0
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm11, %zmm6, %zmm0
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm0
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm4, %zmm19, %zmm0
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm2, %zmm15, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm0
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm11, %zmm28, %zmm0
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm1, %zmm18, %zmm0
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm1, %zmm5, %zmm4
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm4, %zmm8, %zmm2
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm9 = [13,5,13,5,13,5,13,5]
-; AVX512DQ-SLOW-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm0
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm12, %zmm9, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm0
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm7, %zmm17, %zmm0
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm1, %zmm24, %zmm0
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [6,14,6,14,6,14,6,14]
-; AVX512DQ-SLOW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm1
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm12, %zmm0, %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm2
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm11, %zmm25, %zmm7
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm7, %zmm27, %zmm1
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm4
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm0
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm12, %zmm5, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 384(%rdx), %zmm5
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 384(%rcx), %zmm27
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm0
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm27, %zmm25, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 384(%rdi), %zmm7
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 384(%rsi), %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm1, %zmm25, %zmm7
+; AVX512DQ-SLOW-NEXT:    movb $24, %r10b
+; AVX512DQ-SLOW-NEXT:    kmovw %r10d, %k3
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm7 {%k3}
+; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [0,8,0,8,0,8,0,8]
+; AVX512DQ-SLOW-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [3,0,12,4,3,0,12,4]
-; AVX512DQ-SLOW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm2, %zmm0, %zmm12
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm8
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm2
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm12, %zmm9, %zmm2
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm2
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm12, %zmm4, %zmm2
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm20, %zmm1, %zmm30
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm13, %zmm1, %zmm22
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm8, %zmm1, %zmm10
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm12, %zmm1, %zmm9
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm11, %zmm1, %zmm2
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm2
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm12, %zmm5, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 384(%r8), %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm2 = <u,1,2,3,4,15,u,u>
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm0, %zmm2, %zmm7
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 384(%r9), %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm8 = <u,1,2,3,4,5,15,u>
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm2, %zmm8, %zmm7
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm2, %zmm0, %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm2, %zmm0, %zmm6
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm2, %zmm0, %zmm28
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm0, %zmm17, %zmm2
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm1, %zmm0, %zmm12
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm15
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm1
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm14, %zmm9, %zmm1
+; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm6 = [13,5,13,5,13,5,13,5]
+; AVX512DQ-SLOW-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm17, %zmm6, %zmm1
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm1
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm14, %zmm4, %zmm1
+; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm9 = [6,14,6,14,6,14,6,14]
+; AVX512DQ-SLOW-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm17, %zmm9, %zmm1
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm1
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm14, %zmm5, %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm17, %zmm25, %zmm1
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm18, %zmm0, %zmm14
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm14, %zmm18
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm1
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm26, %zmm9, %zmm1
+; AVX512DQ-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [3,0,12,4,3,0,12,4]
+; AVX512DQ-SLOW-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm0, %zmm2, %zmm17
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm20, %zmm6, %zmm1
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm1
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm26, %zmm4, %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm20, %zmm9, %zmm1
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm1
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm26, %zmm5, %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm20, %zmm25, %zmm1
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm13, %zmm0, %zmm26
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm1
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm29, %zmm9, %zmm1
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm0, %zmm2, %zmm20
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm30, %zmm6, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm0
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm30, %zmm9, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm0
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm30, %zmm25, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm23, %zmm2, %zmm30
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm22, %zmm6, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm0
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm22, %zmm9, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm0
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm22, %zmm25, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm21, %zmm2, %zmm22
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm14, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm23, %zmm6, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm14, %zmm0
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm23, %zmm9, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm14, %zmm0
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm23, %zmm25, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm14, %zmm2, %zmm23
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm0
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm29, %zmm6, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm0
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm29, %zmm9, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm0
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm29, %zmm25, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm4, %zmm2, %zmm29
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm29, %zmm18
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 448(%rdx), %zmm10
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 448(%rcx), %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm7
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm0, %zmm31, %zmm7
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm27, %zmm5, %zmm31
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm31, (%rsp) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm1
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm27, %zmm6, %zmm1
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm1
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm29, %zmm4, %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm1
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm27, %zmm9, %zmm1
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm1
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm29, %zmm5, %zmm1
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm5, %zmm2, %zmm27
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm27, %zmm28
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm8
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm10, %zmm2, %zmm8
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm12
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm0, %zmm9, %zmm12
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm11
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm0, %zmm6, %zmm11
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm0, %zmm25, %zmm10
+; AVX512DQ-SLOW-NEXT:    movb $48, %r10b
+; AVX512DQ-SLOW-NEXT:    kmovw %r10d, %k4
+; AVX512DQ-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm5 = [1,0,10,2,1,0,10,2]
+; AVX512DQ-SLOW-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm0, %zmm5, %zmm1
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm11, %zmm0, %zmm29
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm1
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm24, %zmm9, %zmm1
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm17 {%k4} = zmm0[0],zmm2[0],zmm0[2],zmm2[2],zmm0[4],zmm2[4],zmm0[6],zmm2[6]
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm13
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm14
+; AVX512DQ-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm4 = [0,8,0,1,0,8,0,1]
+; AVX512DQ-SLOW-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm2, %zmm4, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm2, %zmm6, %zmm1
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm1
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm24, %zmm4, %zmm1
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm2, %zmm9, %zmm13
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm2, %zmm25, %zmm14
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm0, %zmm5, %zmm1
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm1
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm24, %zmm5, %zmm1
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm20 {%k4} = zmm0[0],zmm2[0],zmm0[2],zmm2[2],zmm0[4],zmm2[4],zmm0[6],zmm2[6]
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm13
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm17
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm2, %zmm4, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm2, %zmm6, %zmm13
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm2, %zmm9, %zmm1
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm6, %zmm0, %zmm24
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm24, %zmm6
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 448(%rdx), %zmm14
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 448(%rcx), %zmm1
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm14, %zmm12
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm1, %zmm27, %zmm12
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm14, %zmm19
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm1, %zmm4, %zmm19
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 384(%rdx), %zmm3
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 384(%rcx), %zmm20
-; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm20, %zmm3, %zmm27
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm2
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm20, %zmm9, %zmm2
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm20, %zmm3, %zmm4
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm27
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm20, %zmm5, %zmm27
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm3, %zmm0, %zmm20
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm3
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm14, %zmm0, %zmm3
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm14, %zmm16
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm1, %zmm5, %zmm14
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm1, %zmm9, %zmm16
-; AVX512DQ-SLOW-NEXT:    movb $48, %r10b
-; AVX512DQ-SLOW-NEXT:    kmovw %r10d, %k3
-; AVX512DQ-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [1,0,10,2,1,0,10,2]
-; AVX512DQ-SLOW-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm2
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm2, %zmm25, %zmm17
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm0, %zmm1, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm1, %zmm5, %zmm2
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm8 {%k3} = zmm0[0],zmm4[0],zmm0[2],zmm4[2],zmm0[4],zmm4[4],zmm0[6],zmm4[6]
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm5
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm8
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm11
-; AVX512DQ-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [0,8,0,1,0,8,0,1]
-; AVX512DQ-SLOW-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm4, %zmm2, %zmm0
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm4, %zmm9, %zmm5
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm13 = [6,14,6,14]
-; AVX512DQ-SLOW-NEXT:    # ymm13 = mem[0,1,0,1]
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm4, %zmm13, %zmm8
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm8 = [15,7,15,7]
-; AVX512DQ-SLOW-NEXT:    # ymm8 = mem[0,1,0,1]
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm4, %zmm8, %zmm11
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 (%rsp), %zmm4 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm5
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm30 {%k4} = zmm1[0],zmm0[0],zmm1[2],zmm0[2],zmm1[4],zmm0[4],zmm1[6],zmm0[6]
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm13
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm29
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm0, %zmm4, %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm0, %zmm6, %zmm13
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm0, %zmm9, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm0, %zmm25, %zmm29
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm0, %zmm1, %zmm5
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm15 {%k3} = zmm0[0],zmm4[0],zmm0[2],zmm4[2],zmm0[4],zmm4[4],zmm0[6],zmm4[6]
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm15
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm5
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm24
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm4, %zmm2, %zmm0
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm4, %zmm9, %zmm15
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm4, %zmm13, %zmm5
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm4, %zmm8, %zmm24
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm0
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm22, %zmm1, %zmm0
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm18 {%k3} = zmm22[0],zmm23[0],zmm22[2],zmm23[2],zmm22[4],zmm23[4],zmm22[6],zmm23[6]
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm22, %zmm4
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm22, %zmm0
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm22, %zmm5
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm23, %zmm2, %zmm22
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm26, %zmm5, %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm22 {%k4} = zmm26[0],zmm0[0],zmm26[2],zmm0[2],zmm26[4],zmm0[4],zmm26[6],zmm0[6]
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm23, %zmm9, %zmm4
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm23, %zmm13, %zmm0
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm23, %zmm8, %zmm5
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm0
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm28, %zmm1, %zmm0
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm26 {%k3} = zmm28[0],zmm21[0],zmm28[2],zmm21[2],zmm28[4],zmm21[4],zmm28[6],zmm21[6]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm26, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm26, %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm26, %zmm31
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm0, %zmm4, %zmm26
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm4
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm0
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm23
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm21, %zmm2, %zmm28
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm21, %zmm9, %zmm4
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm21, %zmm13, %zmm0
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm21, %zmm8, %zmm23
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm0
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm25, %zmm1, %zmm0
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm29 {%k3} = zmm25[0],zmm17[0],zmm25[2],zmm17[2],zmm25[4],zmm17[4],zmm25[6],zmm17[6]
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm25, %zmm31
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm25, %zmm0
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm25, %zmm29
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm25, %zmm30
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm17, %zmm2, %zmm30
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm17, %zmm9, %zmm31
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm17, %zmm13, %zmm0
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm17, %zmm8, %zmm29
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm26
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm7, %zmm1, %zmm26
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm6 {%k3} = zmm7[0],zmm10[0],zmm7[2],zmm10[2],zmm7[4],zmm10[4],zmm7[6],zmm10[6]
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm28
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm0
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm24
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm25
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm10, %zmm2, %zmm25
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm10, %zmm9, %zmm28
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm10, %zmm13, %zmm0
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm0, %zmm6, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm0, %zmm9, %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm0, %zmm25, %zmm31
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm19, %zmm30
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm15, %zmm5, %zmm30
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm23 {%k4} = zmm15[0],zmm19[0],zmm15[2],zmm19[2],zmm15[4],zmm19[4],zmm15[6],zmm19[6]
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm15, %zmm22
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm15, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm15, %zmm26
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm15, %zmm27
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm19, %zmm4, %zmm27
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm19, %zmm6, %zmm22
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm19, %zmm9, %zmm0
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm10, %zmm8, %zmm24
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 448(%rdi), %zmm6
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm19, %zmm25, %zmm26
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 448(%rdi), %zmm15
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 448(%rsi), %zmm0
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm7
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm0, %zmm10, %zmm7
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm4
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm0, %zmm13, %zmm4
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm5
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm0, %zmm9, %zmm5
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 384(%rdi), %zmm21
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 384(%rsi), %zmm17
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm22
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm21, %zmm1, %zmm22
-; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm17, %zmm21, %zmm10
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm17, %zmm21, %zmm9
-; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm17, %zmm21, %zmm13
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm20 {%k3} = zmm21[0],zmm17[0],zmm21[2],zmm17[2],zmm21[4],zmm17[4],zmm21[6],zmm17[6]
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm18
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm17, %zmm2, %zmm21
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm17, %zmm8, %zmm18
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 {%k3} = zmm6[0],zmm0[0],zmm6[2],zmm0[2],zmm6[4],zmm0[4],zmm6[6],zmm0[6]
-; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm0, %zmm6, %zmm2
-; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm6, %zmm0, %zmm1
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm0, %zmm8, %zmm6
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm12 {%k1}
-; AVX512DQ-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm4[0,1,2,3],zmm19[4,5,6,7]
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 448(%r8), %zmm4
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm7 = <0,11,u,u,4,5,6,7>
-; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm4, %zmm3, %zmm7
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 448(%r9), %zmm8
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = <0,1,11,u,4,5,6,7>
-; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm8, %zmm7, %zmm3
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm15
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm15, %zmm13
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm0, %zmm25, %zmm13
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm23
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm3, %zmm25, %zmm23
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm24
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm16, %zmm5, %zmm24
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm18 {%k4} = zmm16[0],zmm3[0],zmm16[2],zmm3[2],zmm16[4],zmm3[4],zmm16[6],zmm3[6]
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm17
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm21
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm3, %zmm4, %zmm21
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm3, %zmm6, %zmm17
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm3, %zmm9, %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm20
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm16, %zmm5, %zmm20
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm15, %zmm3
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm0, %zmm25, %zmm3
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm18, %zmm16, %zmm25
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm15, %zmm1
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm0, %zmm6, %zmm1
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm18, %zmm16, %zmm6
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm15, %zmm2
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm0, %zmm9, %zmm2
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm18, %zmm16, %zmm9
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm28 {%k4} = zmm16[0],zmm18[0],zmm16[2],zmm18[2],zmm16[4],zmm18[4],zmm16[6],zmm18[6]
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm28
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm18, %zmm4, %zmm28
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm8 {%k4} = zmm15[0],zmm0[0],zmm15[2],zmm0[2],zmm15[4],zmm0[4],zmm15[6],zmm0[6]
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm0, %zmm15, %zmm4
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm15, %zmm5, %zmm0
+; AVX512DQ-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm12[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm7 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 448(%r8), %zmm3
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm5 = <0,11,u,u,4,5,6,7>
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm3, %zmm8, %zmm5
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm1 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm8 = <12,u,u,3,4,5,6,13>
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm3, %zmm1, %zmm8
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm13 {%k3}
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = <u,1,2,3,4,15,u,u>
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm3, %zmm1, %zmm13
+; AVX512DQ-SLOW-NEXT:    movb $6, %sil
+; AVX512DQ-SLOW-NEXT:    kmovw %esi, %k4
+; AVX512DQ-SLOW-NEXT:    vpbroadcastq 456(%rcx), %ymm1
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm0 {%k4}
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm10 = <0,1,2,9,u,u,6,7>
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm3, %zmm0, %zmm10
+; AVX512DQ-SLOW-NEXT:    movb $64, %sil
+; AVX512DQ-SLOW-NEXT:    kmovw %esi, %k5
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm2 {%k5}
 ; AVX512DQ-SLOW-NEXT:    movb $4, %sil
-; AVX512DQ-SLOW-NEXT:    kmovw %esi, %k3
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm12 {%k3}
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm7 = <0,1,2,10,u,5,6,7>
-; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm8, %zmm12, %zmm7
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm5 {%k1}
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = <12,u,u,3,4,5,6,13>
-; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm4, %zmm5, %zmm3
+; AVX512DQ-SLOW-NEXT:    kmovw %esi, %k5
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm7 {%k5}
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 448(%r9), %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = <0,1,11,u,4,5,6,7>
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm0, %zmm5, %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm11
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = <0,1,2,10,u,5,6,7>
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm0, %zmm7, %zmm3
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm5 = <0,12,u,3,4,5,6,7>
-; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm8, %zmm3, %zmm5
-; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,8,0,8,0,8,0,8]
-; AVX512DQ-SLOW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm19 # 64-byte Folded Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm12 # 64-byte Folded Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm0, %zmm8, %zmm5
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm7 = <13,u,2,3,4,5,6,14>
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm0, %zmm2, %zmm7
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = <u,1,2,3,4,5,15,u>
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm0, %zmm1, %zmm13
+; AVX512DQ-SLOW-NEXT:    movb $12, %sil
+; AVX512DQ-SLOW-NEXT:    kmovw %esi, %k5
+; AVX512DQ-SLOW-NEXT:    vmovdqa 448(%rdx), %xmm2
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0]
+; AVX512DQ-SLOW-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm0, %zmm4 {%k5}
+; AVX512DQ-SLOW-NEXT:    vinserti32x4 $2, 448(%r8), %zmm4, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm4 = <0,1,2,3,4,8,u,7>
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm0, %zmm2, %zmm4
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm2 = <0,1,2,3,9,u,6,7>
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm0, %zmm10, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 384(%rax), %zmm0
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm10 # 64-byte Folded Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm0, %zmm8, %zmm10
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 (%rsp), %zmm1 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm25, %zmm1 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqa 384(%r9), %ymm12
+; AVX512DQ-SLOW-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa 384(%r8), %ymm8
+; AVX512DQ-SLOW-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm8 = ymm8[0],ymm12[0],ymm8[2],ymm12[2]
+; AVX512DQ-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k2} = zmm8[2,3,2,3],zmm0[2,3,2,3]
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, (%rsp) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm12 # 64-byte Folded Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm0, %zmm12, %zmm8
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm12 # 64-byte Folded Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm0, %zmm12, %zmm8
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm8 = [14,1,2,3,4,5,6,15]
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm12 # 64-byte Folded Reload
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm0, %zmm8, %zmm12
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    movb $24, %sil
-; AVX512DQ-SLOW-NEXT:    kmovw %esi, %k5
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm27, %zmm18 {%k5}
-; AVX512DQ-SLOW-NEXT:    movb $6, %sil
-; AVX512DQ-SLOW-NEXT:    kmovw %esi, %k3
-; AVX512DQ-SLOW-NEXT:    vpbroadcastq 456(%rcx), %ymm12
-; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm12 = mem[0,1,2,3],ymm12[4,5,6,7]
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm12, %zmm0, %zmm1 {%k3}
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm12 = <0,1,2,9,u,u,6,7>
-; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm4, %zmm1, %zmm12
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm14, %zmm6 {%k5}
-; AVX512DQ-SLOW-NEXT:    movb $64, %sil
-; AVX512DQ-SLOW-NEXT:    kmovw %esi, %k4
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm0 {%k4}
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = <u,1,2,3,4,15,u,u>
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm4, %zmm1, %zmm6
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 384(%r8), %zmm4
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm4, %zmm1, %zmm18
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 384(%r9), %zmm1
-; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm1, %zmm4, %zmm3
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm1, %zmm4, %zmm14
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm1, %zmm4, %zmm16
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm4, %zmm1, %zmm17
-; AVX512DQ-SLOW-NEXT:    movb $12, %sil
-; AVX512DQ-SLOW-NEXT:    kmovw %esi, %k4
-; AVX512DQ-SLOW-NEXT:    vmovdqa 448(%rdx), %xmm4
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],mem[0]
-; AVX512DQ-SLOW-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm0, %zmm2 {%k4}
-; AVX512DQ-SLOW-NEXT:    vinserti32x4 $2, 448(%r8), %zmm2, %zmm2
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm4 = <0,1,2,3,4,8,u,7>
-; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm8, %zmm2, %zmm4
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm2 = <u,1,2,3,4,5,15,u>
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm1, %zmm2, %zmm18
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = <0,1,2,3,9,u,6,7>
-; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm8, %zmm12, %zmm1
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm12 = <13,u,2,3,4,5,6,14>
-; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm8, %zmm0, %zmm12
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm8, %zmm2, %zmm6
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 448(%rax), %zmm2
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,10,5,6,7]
-; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm2, %zmm7, %zmm0
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [0,1,12,3,4,5,6,7]
-; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm2, %zmm5, %zmm0
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 384(%rax), %zmm5
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm5, %zmm0, %zmm14
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm14, %zmm8
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm14 {%k1}
-; AVX512DQ-SLOW-NEXT:    vmovdqa 384(%r9), %ymm7
-; AVX512DQ-SLOW-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa 384(%r8), %ymm0
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm7 = ymm0[0],ymm7[0],ymm0[2],ymm7[2]
-; AVX512DQ-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm7[2,3,2,3],zmm5[2,3,2,3]
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm5, %zmm7, %zmm16
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm14
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm5, %zmm7, %zmm17
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,8,7]
-; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm2, %zmm4, %zmm7
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [14,1,2,3,4,5,6,15]
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm5, %zmm4, %zmm18
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,9,6,7]
-; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm2, %zmm1, %zmm5
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 448(%rax), %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,10,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm0, %zmm3, %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,1,12,3,4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm0, %zmm5, %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,13,2,3,4,5,6,7]
-; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm2, %zmm12, %zmm1
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm0, %zmm7, %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm0, %zmm8, %zmm13
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,4,5,8,7]
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm0, %zmm4, %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,4,9,6,7]
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm0, %zmm2, %zmm1
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm2, %zmm4, %zmm6
 ; AVX512DQ-SLOW-NEXT:    movb $8, %sil
 ; AVX512DQ-SLOW-NEXT:    kmovw %esi, %k2
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm15 {%k2}
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm11 {%k5}
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k5}
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k5}
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm23 {%k5}
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm29 {%k5}
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm11 {%k2}
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm14 {%k3}
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm24 {%k5}
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k3}
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm29 {%k3}
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm31 {%k3}
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm26 {%k3}
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm23 {%k3}
 ; AVX512DQ-SLOW-NEXT:    movb $-31, %sil
 ; AVX512DQ-SLOW-NEXT:    kmovw %esi, %k2
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm14 {%k2}
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k2}
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm29 {%k2}
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm31 {%k2}
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm26 {%k2}
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm23 {%k2}
+; AVX512DQ-SLOW-NEXT:    vmovdqa (%rdx), %xmm0
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0]
+; AVX512DQ-SLOW-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm11 {%k2}
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k2}
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k2}
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm23 {%k2}
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm29 {%k2}
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm24 {%k2}
-; AVX512DQ-SLOW-NEXT:    vmovdqa (%rdx), %xmm1
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0]
-; AVX512DQ-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm7 {%k4}
-; AVX512DQ-SLOW-NEXT:    vmovdqa 64(%rdx), %xmm1
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0]
-; AVX512DQ-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm1 {%k5}
+; AVX512DQ-SLOW-NEXT:    vmovdqa 64(%rdx), %xmm0
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0]
+; AVX512DQ-SLOW-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm2 {%k4}
-; AVX512DQ-SLOW-NEXT:    vmovdqa 128(%rdx), %xmm1
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0]
-; AVX512DQ-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k5}
+; AVX512DQ-SLOW-NEXT:    vmovdqa 128(%rdx), %xmm0
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0]
+; AVX512DQ-SLOW-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm3 {%k5}
+; AVX512DQ-SLOW-NEXT:    vmovdqa 192(%rdx), %xmm0
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0]
+; AVX512DQ-SLOW-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm4 {%k4}
-; AVX512DQ-SLOW-NEXT:    vmovdqa 192(%rdx), %xmm1
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0]
-; AVX512DQ-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm5 {%k4}
-; AVX512DQ-SLOW-NEXT:    vmovdqa 256(%rdx), %xmm1
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0]
-; AVX512DQ-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm30 {%k4}
-; AVX512DQ-SLOW-NEXT:    vmovdqa 320(%rdx), %xmm1
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0]
-; AVX512DQ-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm25 {%k4}
-; AVX512DQ-SLOW-NEXT:    vmovdqa 384(%rdx), %xmm1
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0]
-; AVX512DQ-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm21 {%k4}
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm4 {%k5}
+; AVX512DQ-SLOW-NEXT:    vmovdqa 256(%rdx), %xmm0
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0]
+; AVX512DQ-SLOW-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm27 {%k5}
+; AVX512DQ-SLOW-NEXT:    vmovdqa 320(%rdx), %xmm0
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0]
+; AVX512DQ-SLOW-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm21 {%k5}
+; AVX512DQ-SLOW-NEXT:    vmovdqa 384(%rdx), %xmm0
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0]
+; AVX512DQ-SLOW-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm28 {%k5}
 ; AVX512DQ-SLOW-NEXT:    movb $112, %sil
 ; AVX512DQ-SLOW-NEXT:    kmovw %esi, %k2
-; AVX512DQ-SLOW-NEXT:    vinserti64x2 $3, (%rax), %zmm19, %zmm7 {%k2}
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vinserti64x2 $3, 64(%rax), %zmm1, %zmm2 {%k2}
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vinserti64x2 $3, (%rax), %zmm0, %zmm1 {%k2}
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vinserti64x2 $3, 64(%rax), %zmm0, %zmm2 {%k2}
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vinserti64x2 $3, 128(%rax), %zmm10, %zmm4 {%k2}
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vinserti64x2 $3, 128(%rax), %zmm0, %zmm3 {%k2}
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vinserti64x2 $3, 192(%rax), %zmm0, %zmm4 {%k2}
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vinserti64x2 $3, 256(%rax), %zmm0, %zmm27 {%k2}
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vinserti64x2 $3, 320(%rax), %zmm0, %zmm21 {%k2}
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vinserti64x2 $3, 384(%rax), %zmm0, %zmm28 {%k2}
+; AVX512DQ-SLOW-NEXT:    vpbroadcastq 8(%rcx), %ymm0
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vinserti64x2 $3, 192(%rax), %zmm1, %zmm5 {%k2}
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vinserti64x2 $3, 256(%rax), %zmm1, %zmm30 {%k2}
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vinserti64x2 $3, 320(%rax), %zmm1, %zmm25 {%k2}
-; AVX512DQ-SLOW-NEXT:    vinserti64x2 $3, 384(%rax), %zmm3, %zmm21 {%k2}
-; AVX512DQ-SLOW-NEXT:    vpbroadcastq 8(%rcx), %ymm1
-; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm5 {%k3}
-; AVX512DQ-SLOW-NEXT:    vpbroadcastq 72(%rcx), %ymm1
-; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm1 {%k4}
+; AVX512DQ-SLOW-NEXT:    vpbroadcastq 72(%rcx), %ymm0
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm2 {%k3}
-; AVX512DQ-SLOW-NEXT:    vpbroadcastq 136(%rcx), %ymm1
-; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k4}
+; AVX512DQ-SLOW-NEXT:    vpbroadcastq 136(%rcx), %ymm0
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm3 {%k3}
-; AVX512DQ-SLOW-NEXT:    vpbroadcastq 200(%rcx), %ymm1
-; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 (%rsp), %zmm4 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm4 {%k3}
-; AVX512DQ-SLOW-NEXT:    vpbroadcastq 264(%rcx), %ymm1
-; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm7 {%k3}
-; AVX512DQ-SLOW-NEXT:    vpbroadcastq 328(%rcx), %ymm1
-; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm26 {%k3}
-; AVX512DQ-SLOW-NEXT:    vpbroadcastq 392(%rcx), %ymm1
-; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm22 {%k3}
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm3 {%k4}
+; AVX512DQ-SLOW-NEXT:    vpbroadcastq 200(%rcx), %ymm0
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm4 {%k4}
+; AVX512DQ-SLOW-NEXT:    vpbroadcastq 264(%rcx), %ymm0
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm30 {%k4}
+; AVX512DQ-SLOW-NEXT:    vpbroadcastq 328(%rcx), %ymm0
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm24 {%k4}
+; AVX512DQ-SLOW-NEXT:    vpbroadcastq 392(%rcx), %ymm0
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm20 {%k4}
 ; AVX512DQ-SLOW-NEXT:    movb $56, %cl
 ; AVX512DQ-SLOW-NEXT:    kmovw %ecx, %k2
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm5 {%k2}
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k2}
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k2}
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k2}
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm3 {%k2}
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm3 {%k2}
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm4 {%k2}
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm30 {%k2}
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm24 {%k2}
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm20 {%k2}
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k2}
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm4, (%rsp) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm7 {%k2}
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm26 {%k2}
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm22 {%k2}
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm3 {%k1}
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm3 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm5 {%k1}
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm31 {%k1}
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm28 {%k1}
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm9 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm4 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm22 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm17 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm6 {%k1}
 ; AVX512DQ-SLOW-NEXT:    movb $120, %cl
 ; AVX512DQ-SLOW-NEXT:    kmovw %ecx, %k1
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm16 {%k1}
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm15 # 64-byte Folded Reload
-; AVX512DQ-SLOW-NEXT:    # zmm15 = zmm1[0,1,2,3],mem[4,5,6,7]
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm19 {%k1}
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm27 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm19 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm16 # 64-byte Folded Reload
+; AVX512DQ-SLOW-NEXT:    # zmm16 = zmm0[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm14 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm15 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm25 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm22, %zmm29 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm22 {%k1}
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm12 {%k1}
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm10 {%k1}
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm31 {%k1}
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm14 {%k1}
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm14, %zmm23
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm12 {%k1}
 ; AVX512DQ-SLOW-NEXT:    movb $-61, %cl
 ; AVX512DQ-SLOW-NEXT:    kmovw %ecx, %k1
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm15 {%k1}
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm14 # 64-byte Folded Reload
-; AVX512DQ-SLOW-NEXT:    # zmm14 = zmm1[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm16 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm11 # 64-byte Folded Reload
+; AVX512DQ-SLOW-NEXT:    # zmm11 = zmm0[0,1,2,3],mem[4,5,6,7]
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm14 {%k1}
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm11 # 64-byte Folded Reload
-; AVX512DQ-SLOW-NEXT:    # zmm11 = zmm1[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm11 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm10 # 64-byte Folded Reload
+; AVX512DQ-SLOW-NEXT:    # zmm10 = zmm0[0,1,2,3],mem[4,5,6,7]
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm11 {%k1}
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm4 # 64-byte Folded Reload
-; AVX512DQ-SLOW-NEXT:    # zmm4 = zmm1[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm10 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload
+; AVX512DQ-SLOW-NEXT:    # zmm4 = zmm0[0,1,2,3],mem[4,5,6,7]
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm4 {%k1}
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm5 # 64-byte Folded Reload
-; AVX512DQ-SLOW-NEXT:    # zmm5 = zmm1[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload
+; AVX512DQ-SLOW-NEXT:    # zmm5 = zmm0[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm5 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm6 # 64-byte Folded Reload
+; AVX512DQ-SLOW-NEXT:    # zmm6 = zmm0[0,1,2,3],mem[4,5,6,7]
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm5 {%k1}
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm7 # 64-byte Folded Reload
-; AVX512DQ-SLOW-NEXT:    # zmm7 = zmm1[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm6 {%k1}
+; AVX512DQ-SLOW-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm7 # 64-byte Folded Reload
+; AVX512DQ-SLOW-NEXT:    # zmm7 = zmm9[0,1,2,3],mem[4,5,6,7]
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm7 {%k1}
-; AVX512DQ-SLOW-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm8 # 64-byte Folded Reload
-; AVX512DQ-SLOW-NEXT:    # zmm8 = zmm13[0,1,2,3],mem[4,5,6,7]
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm8 {%k1}
-; AVX512DQ-SLOW-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload
-; AVX512DQ-SLOW-NEXT:    # ymm9 = ymm9[1],mem[1],ymm9[3],mem[3]
-; AVX512DQ-SLOW-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[0,2,3,3]
-; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7]
+; AVX512DQ-SLOW-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
+; AVX512DQ-SLOW-NEXT:    # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3]
+; AVX512DQ-SLOW-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,2,3,3]
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7]
 ; AVX512DQ-SLOW-NEXT:    movb $14, %cl
 ; AVX512DQ-SLOW-NEXT:    kmovw %ecx, %k1
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm9, %zmm0, %zmm13 {%k1}
-; AVX512DQ-SLOW-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload
-; AVX512DQ-SLOW-NEXT:    # ymm9 = ymm9[1],mem[1],ymm9[3],mem[3]
-; AVX512DQ-SLOW-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[0,2,3,3]
-; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7]
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm8, %zmm0, %zmm9 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
+; AVX512DQ-SLOW-NEXT:    # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3]
+; AVX512DQ-SLOW-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,2,3,3]
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7]
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm9, %zmm0, %zmm17 {%k1}
-; AVX512DQ-SLOW-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload
-; AVX512DQ-SLOW-NEXT:    # ymm9 = ymm9[1],mem[1],ymm9[3],mem[3]
-; AVX512DQ-SLOW-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[0,2,3,3]
-; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7]
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm9, %zmm0, %zmm28 {%k1}
-; AVX512DQ-SLOW-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload
-; AVX512DQ-SLOW-NEXT:    # ymm9 = ymm9[1],mem[1],ymm9[3],mem[3]
-; AVX512DQ-SLOW-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[0,2,3,3]
-; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm8, %zmm0, %zmm17 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
+; AVX512DQ-SLOW-NEXT:    # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3]
+; AVX512DQ-SLOW-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,2,3,3]
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7]
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm8, %zmm0, %zmm18 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
+; AVX512DQ-SLOW-NEXT:    # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3]
+; AVX512DQ-SLOW-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,2,3,3]
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7]
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm9, %zmm0, %zmm3 {%k1}
-; AVX512DQ-SLOW-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload
-; AVX512DQ-SLOW-NEXT:    # ymm9 = ymm9[1],mem[1],ymm9[3],mem[3]
-; AVX512DQ-SLOW-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[0,2,3,3]
-; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm8, %zmm0, %zmm3 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
+; AVX512DQ-SLOW-NEXT:    # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3]
+; AVX512DQ-SLOW-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,2,3,3]
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7]
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm9, %zmm0, %zmm2 {%k1}
-; AVX512DQ-SLOW-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload
-; AVX512DQ-SLOW-NEXT:    # ymm9 = ymm9[1],mem[1],ymm9[3],mem[3]
-; AVX512DQ-SLOW-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[0,2,3,3]
-; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm8, %zmm0, %zmm2 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
+; AVX512DQ-SLOW-NEXT:    # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3]
+; AVX512DQ-SLOW-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,2,3,3]
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7]
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm9, %zmm0, %zmm1 {%k1}
-; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX512DQ-SLOW-NEXT:    # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
-; AVX512DQ-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,3,3]
-; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm20 {%k1}
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm8, %zmm0, %zmm1 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload
+; AVX512DQ-SLOW-NEXT:    # ymm8 = ymm0[1],mem[1],ymm0[3],mem[3]
+; AVX512DQ-SLOW-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,2,3,3]
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7]
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm8, %zmm0, %zmm0 {%k1}
 ; AVX512DQ-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm8, 3008(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm23, 2944(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm20, 2880(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm7, 3008(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm12, 2944(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, 2880(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovups (%rsp), %zmm0 # 64-byte Reload
 ; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 2816(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm22, 2752(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm21, 2688(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm24, 2624(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm7, 2560(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm31, 2496(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm20, 2752(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm28, 2688(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm23, 2624(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm6, 2560(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm22, 2496(%rax)
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, 2432(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 2368(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm26, 2304(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm25, 2240(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm29, 2176(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm6, 2368(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm24, 2304(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm21, 2240(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm26, 2176(%rax)
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm5, 2112(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm10, 2048(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm29, 2048(%rax)
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm2, 1984(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 1920(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 1856(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm30, 1792(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 1728(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm5, 1920(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm30, 1856(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm27, 1792(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm31, 1728(%rax)
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm4, 1664(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm12, 1600(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm25, 1600(%rax)
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, 1536(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm4, 1472(%rax)
 ; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 1472(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovups (%rsp), %zmm0 # 64-byte Reload
 ; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 1408(%rax)
 ; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 1344(%rax)
 ; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 1280(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm11, 1216(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm27, 1152(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm28, 1088(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 1024(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm10, 1216(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm15, 1152(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm18, 1088(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm3, 1024(%rax)
 ; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 960(%rax)
 ; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 896(%rax)
 ; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 832(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm14, 768(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm19, 704(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm11, 768(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm14, 704(%rax)
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm17, 640(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 576(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm2, 576(%rax)
 ; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 512(%rax)
 ; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 448(%rax)
 ; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 384(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm15, 320(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm16, 256(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm13, 192(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm16, 320(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm19, 256(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm9, 192(%rax)
 ; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 128(%rax)
 ; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 64(%rax)
 ; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, (%rax)
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm6, 3520(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm13, 3520(%rax)
 ; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 3456(%rax)
 ; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
@@ -15486,815 +15413,809 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 3264(%rax)
 ; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 3200(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm18, 3072(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 3072(%rax)
 ; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 3136(%rax)
-; AVX512DQ-SLOW-NEXT:    addq $6472, %rsp # imm = 0x1948
+; AVX512DQ-SLOW-NEXT:    addq $6280, %rsp # imm = 0x1888
 ; AVX512DQ-SLOW-NEXT:    vzeroupper
 ; AVX512DQ-SLOW-NEXT:    retq
 ;
 ; AVX512DQ-FAST-LABEL: store_i64_stride7_vf64:
 ; AVX512DQ-FAST:       # %bb.0:
-; AVX512DQ-FAST-NEXT:    subq $6568, %rsp # imm = 0x19A8
+; AVX512DQ-FAST-NEXT:    subq $6120, %rsp # imm = 0x17E8
 ; AVX512DQ-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-FAST-NEXT:    vmovdqa64 (%rdi), %zmm5
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%rdi), %zmm11
+; AVX512DQ-FAST-NEXT:    vmovdqa64 (%rdi), %zmm6
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%rdi), %zmm12
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 (%rsi), %zmm14
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%rsi), %zmm16
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%rdx), %zmm11
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 (%rsi), %zmm26
-; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%rsi), %zmm22
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%rdx), %zmm7
+; AVX512DQ-FAST-NEXT:    vmovdqa64 (%rdx), %zmm7
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 (%rdx), %zmm6
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 (%rcx), %zmm9
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%rcx), %zmm19
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm10 = [11,3,11,3,11,3,11,3]
-; AVX512DQ-FAST-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm12 = [2,10,0,3,2,10,0,3]
-; AVX512DQ-FAST-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 (%rcx), %zmm13
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%rcx), %zmm21
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm8 = [11,3,11,3,11,3,11,3]
+; AVX512DQ-FAST-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm4 = [2,10,0,3,2,10,0,3]
+; AVX512DQ-FAST-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3]
 ; AVX512DQ-FAST-NEXT:    movb $96, %r10b
 ; AVX512DQ-FAST-NEXT:    kmovw %r10d, %k1
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 (%r8), %zmm1
-; AVX512DQ-FAST-NEXT:    vmovdqa64 (%r9), %zmm8
+; AVX512DQ-FAST-NEXT:    vmovdqa64 (%r9), %zmm22
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 (%rax), %zmm3
-; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%rax), %zmm4
+; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%rax), %zmm5
+; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm23 = [15,7,15,7,15,7,15,7]
+; AVX512DQ-FAST-NEXT:    # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [9,1,9,1,9,1,9,1]
 ; AVX512DQ-FAST-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm1, %zmm2
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm8, %zmm0, %zmm2
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm16
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm22, %zmm0, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm9
 ; AVX512DQ-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [4,9,0,3,4,9,0,3]
 ; AVX512DQ-FAST-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
 ; AVX512DQ-FAST-NEXT:    vpermt2q %zmm3, %zmm0, %zmm2
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm13
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm10
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm5, %zmm0
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm26, %zmm10, %zmm0
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm6, %zmm2
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm9, %zmm12, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm6, %zmm0
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm14, %zmm8, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm7, %zmm2
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm13, %zmm4, %zmm2
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k1}
-; AVX512DQ-FAST-NEXT:    vmovdqa (%r9), %ymm0
+; AVX512DQ-FAST-NEXT:    vmovdqa (%r9), %ymm7
+; AVX512DQ-FAST-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa 64(%r9), %ymm13
+; AVX512DQ-FAST-NEXT:    vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa (%r8), %ymm0
 ; AVX512DQ-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa 64(%r9), %ymm9
-; AVX512DQ-FAST-NEXT:    vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa (%r8), %ymm6
+; AVX512DQ-FAST-NEXT:    vmovdqa 64(%r8), %ymm6
 ; AVX512DQ-FAST-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%r8), %ymm30
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm6[0],ymm0[0],ymm6[2],ymm0[2]
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[2],ymm7[2]
 ; AVX512DQ-FAST-NEXT:    movb $28, %r10b
 ; AVX512DQ-FAST-NEXT:    kmovw %r10d, %k2
 ; AVX512DQ-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm0[2,3,2,3],zmm3[2,3,2,3]
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [4,12,0,5,4,12,0,5]
-; AVX512DQ-FAST-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm1, %zmm0
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm8, %zmm6
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm8, %zmm2, %zmm0
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, %zmm20
-; AVX512DQ-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [0,1,12,7,0,1,12,7]
-; AVX512DQ-FAST-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm3, %zmm2, %zmm0
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, %zmm17
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [5,0,14,6,5,0,14,6]
+; AVX512DQ-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [4,12,0,5,4,12,0,5]
 ; AVX512DQ-FAST-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm8, %zmm2
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm1, %zmm0, %zmm2
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm21
-; AVX512DQ-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [0,13,6,7,0,13,6,7]
-; AVX512DQ-FAST-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm3, %zmm0, %zmm2
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm18
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm1, %zmm2
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm22, %zmm0, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm7
+; AVX512DQ-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm15 = [0,1,12,7,0,1,12,7]
+; AVX512DQ-FAST-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm3, %zmm15, %zmm2
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm8 = [15,7,15,7,15,7,15,7]
-; AVX512DQ-FAST-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm6, %zmm8, %zmm1
-; AVX512DQ-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm14 = [6,13,14,7,6,13,14,7]
-; AVX512DQ-FAST-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm1, %zmm14, %zmm3
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm11, %zmm1
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm22, %zmm10, %zmm1
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm7, %zmm2
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm12, %zmm7
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm19, %zmm12, %zmm2
+; AVX512DQ-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm20 = [5,0,14,6,5,0,14,6]
+; AVX512DQ-FAST-NEXT:    # zmm20 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm22, %zmm2
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm1, %zmm20, %zmm2
+; AVX512DQ-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm19 = [0,13,6,7,0,13,6,7]
+; AVX512DQ-FAST-NEXT:    # zmm19 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm3, %zmm19, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, %zmm2
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm22, %zmm23, %zmm1
+; AVX512DQ-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm18 = [6,13,14,7,6,13,14,7]
+; AVX512DQ-FAST-NEXT:    # zmm18 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm1, %zmm18, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm12, %zmm1
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm16, %zmm8, %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm11, %zmm2
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm21, %zmm4, %zmm2
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm30[0],ymm9[0],ymm30[2],ymm9[2]
-; AVX512DQ-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm1[2,3,2,3],zmm4[2,3,2,3]
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm6[0],ymm13[0],ymm6[2],ymm13[2]
+; AVX512DQ-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm1[2,3,2,3],zmm5[2,3,2,3]
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%r8), %zmm1
-; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%r9), %zmm3
+; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%r9), %zmm27
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm1, %zmm2
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm3, %zmm16, %zmm2
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm4, %zmm13, %zmm2
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm27, %zmm9, %zmm2
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm5, %zmm10, %zmm2
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm1, %zmm2
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm3, %zmm20, %zmm2
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm4, %zmm17, %zmm2
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm27, %zmm0, %zmm2
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm5, %zmm15, %zmm2
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, %zmm2
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm1, %zmm21, %zmm2
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm4, %zmm0, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm27, %zmm2
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm1, %zmm20, %zmm2
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm5, %zmm19, %zmm2
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm3, %zmm8, %zmm1
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm1, %zmm14, %zmm4
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm27, %zmm23, %zmm1
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm1, %zmm18, %zmm5
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 128(%rdi), %zmm1
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 128(%rsi), %zmm19
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm19, %zmm10, %zmm1
-; AVX512DQ-FAST-NEXT:    vmovdqa64 128(%rdx), %zmm3
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 128(%rcx), %zmm27
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm27, %zmm12, %zmm3
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm1, %zmm3 {%k1}
-; AVX512DQ-FAST-NEXT:    vmovdqa64 128(%rax), %zmm5
+; AVX512DQ-FAST-NEXT:    vmovdqa64 128(%rsi), %zmm26
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm26, %zmm8, %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqa64 128(%rdx), %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 128(%rcx), %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm0, %zmm4, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqa64 128(%rax), %zmm6
 ; AVX512DQ-FAST-NEXT:    vmovdqa 128(%r9), %ymm0
 ; AVX512DQ-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 128(%r8), %ymm24
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm2 = ymm24[0],ymm0[0],ymm24[2],ymm0[2]
-; AVX512DQ-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm2[2,3,2,3],zmm5[2,3,2,3]
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 128(%r8), %zmm2
-; AVX512DQ-FAST-NEXT:    vmovdqa64 128(%r9), %zmm0
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, %zmm3
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm16, %zmm4
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm0, %zmm16, %zmm3
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm5, %zmm13, %zmm3
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, %zmm3
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm0, %zmm20, %zmm3
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm5, %zmm17, %zmm3
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm3
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm2, %zmm21, %zmm3
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm5, %zmm18, %zmm3
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa 128(%r8), %ymm5
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm5[0],ymm0[0],ymm5[2],ymm0[2]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %ymm5, %ymm30
+; AVX512DQ-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm1[2,3,2,3],zmm6[2,3,2,3]
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm0, %zmm8, %zmm2
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm2, %zmm14, %zmm5
+; AVX512DQ-FAST-NEXT:    vmovdqa64 128(%r8), %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqa64 128(%r9), %zmm16
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, %zmm5
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm16, %zmm9, %zmm5
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm9, %zmm1
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm6, %zmm10, %zmm5
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, %zmm5
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm16, %zmm7, %zmm5
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm6, %zmm15, %zmm5
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm16, %zmm5
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm2, %zmm20, %zmm5
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm6, %zmm19, %zmm5
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm16, %zmm23, %zmm2
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm2, %zmm18, %zmm6
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 192(%rdi), %zmm2
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 192(%rsi), %zmm16
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm10, %zmm1
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm16, %zmm10, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqa64 192(%rsi), %zmm28
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm28, %zmm8, %zmm2
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 192(%rdx), %zmm5
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 192(%rcx), %zmm15
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm15, %zmm12, %zmm5
+; AVX512DQ-FAST-NEXT:    vmovdqa64 192(%rcx), %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm0, %zmm4, %zmm5
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, %zmm5 {%k1}
-; AVX512DQ-FAST-NEXT:    vmovdqa64 192(%rax), %zmm23
-; AVX512DQ-FAST-NEXT:    vmovdqa64 192(%r9), %ymm25
-; AVX512DQ-FAST-NEXT:    vmovdqa 192(%r8), %ymm11
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm11[0],ymm25[0],ymm11[2],ymm25[2]
-; AVX512DQ-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm3[2,3,2,3],zmm23[2,3,2,3]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 192(%rax), %zmm22
+; AVX512DQ-FAST-NEXT:    vmovdqa64 192(%r9), %ymm24
+; AVX512DQ-FAST-NEXT:    vmovdqa 192(%r8), %ymm9
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm2 = ymm9[0],ymm24[0],ymm9[2],ymm24[2]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %ymm9, %ymm25
+; AVX512DQ-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm2[2,3,2,3],zmm22[2,3,2,3]
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 192(%r8), %zmm3
-; AVX512DQ-FAST-NEXT:    vmovdqa64 192(%r9), %zmm6
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, %zmm5
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm6, %zmm4, %zmm5
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm4, %zmm2
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm23, %zmm13, %zmm5
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm13, %zmm0
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, %zmm4
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm6, %zmm5
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm6, %zmm20, %zmm4
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm20, %zmm6
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm17, %zmm10
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm23, %zmm17, %zmm4
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm5, %zmm4
+; AVX512DQ-FAST-NEXT:    vmovdqa64 192(%r8), %zmm5
+; AVX512DQ-FAST-NEXT:    vmovdqa64 192(%r9), %zmm13
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm5, %zmm9
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm13, %zmm1, %zmm9
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm22, %zmm10, %zmm9
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm10, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm5, %zmm12
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm7, %zmm6
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm13, %zmm7, %zmm12
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm15, %zmm9
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm22, %zmm15, %zmm12
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm13, %zmm7
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm5, %zmm20, %zmm7
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm22, %zmm19, %zmm7
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm5, %zmm17
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm13, %zmm23, %zmm5
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm5, %zmm18, %zmm22
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 256(%rdi), %zmm5
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm3, %zmm21, %zmm4
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm23, %zmm18, %zmm4
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 256(%rsi), %zmm3
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm5, %zmm8, %zmm3
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm3, %zmm14, %zmm23
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 256(%rdi), %zmm3
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 256(%rsi), %zmm31
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm31, %zmm1, %zmm3
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm1, %zmm22
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 256(%rdx), %zmm28
-; AVX512DQ-FAST-NEXT:    vmovdqa64 256(%rcx), %zmm12
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm28, %zmm13
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm12, %zmm7, %zmm13
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm3, %zmm8, %zmm5
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm8, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 256(%rdx), %zmm7
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, %zmm13 {%k1}
-; AVX512DQ-FAST-NEXT:    vmovdqa64 256(%rax), %zmm23
-; AVX512DQ-FAST-NEXT:    vmovdqa 256(%r9), %ymm9
-; AVX512DQ-FAST-NEXT:    vmovdqa 256(%r8), %ymm5
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm4 = ymm5[0],ymm9[0],ymm5[2],ymm9[2]
-; AVX512DQ-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm13 {%k2} = zmm4[2,3,2,3],zmm23[2,3,2,3]
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 256(%r8), %zmm4
-; AVX512DQ-FAST-NEXT:    vmovdqa64 256(%r9), %zmm1
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm4, %zmm13
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm1, %zmm2, %zmm13
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, %zmm3
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm23, %zmm0, %zmm13
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm17
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm4, %zmm0
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm1, %zmm2
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm1, %zmm20, %zmm0
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm23, %zmm10, %zmm0
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm10, %zmm1
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, %zmm10
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm4, %zmm21, %zmm10
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm21, %zmm29
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm23, %zmm18, %zmm10
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 256(%rcx), %zmm29
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm29, %zmm4, %zmm7
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm4, %zmm3
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm2, %zmm8, %zmm4
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm4, %zmm14, %zmm23
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 320(%rdi), %zmm23
-; AVX512DQ-FAST-NEXT:    vmovdqa64 320(%rsi), %zmm20
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm23, %zmm4
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm20, %zmm22, %zmm4
-; AVX512DQ-FAST-NEXT:    vmovdqa64 320(%rdx), %zmm0
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 320(%rcx), %zmm13
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm13, %zmm7, %zmm0
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm4, %zmm0 {%k1}
-; AVX512DQ-FAST-NEXT:    vmovdqa64 320(%rax), %zmm21
-; AVX512DQ-FAST-NEXT:    vmovdqa 320(%r9), %ymm4
-; AVX512DQ-FAST-NEXT:    vmovdqa 320(%r8), %ymm2
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm10 = ymm2[0],ymm4[0],ymm2[2],ymm4[2]
-; AVX512DQ-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k2} = zmm10[2,3,2,3],zmm21[2,3,2,3]
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm5, %zmm7 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqa64 256(%rax), %zmm22
+; AVX512DQ-FAST-NEXT:    vmovdqa 256(%r9), %ymm15
+; AVX512DQ-FAST-NEXT:    vmovdqa 256(%r8), %ymm11
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm5 = ymm11[0],ymm15[0],ymm11[2],ymm15[2]
+; AVX512DQ-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm7 {%k2} = zmm5[2,3,2,3],zmm22[2,3,2,3]
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 256(%r8), %zmm7
+; AVX512DQ-FAST-NEXT:    vmovdqa64 256(%r9), %zmm5
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm7, %zmm12
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm5, %zmm1, %zmm12
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm1, %zmm8
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm22, %zmm10, %zmm12
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm7, %zmm12
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm5, %zmm6, %zmm12
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm22, %zmm9, %zmm12
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm9, %zmm4
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm5, %zmm9
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm7, %zmm20, %zmm9
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm22, %zmm19, %zmm9
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm7, %zmm12
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm5, %zmm23, %zmm7
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm7, %zmm18, %zmm22
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 320(%rdi), %zmm31
+; AVX512DQ-FAST-NEXT:    vmovdqa64 320(%rsi), %zmm21
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm31, %zmm7
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm21, %zmm2, %zmm7
+; AVX512DQ-FAST-NEXT:    vmovdqa64 320(%rdx), %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 320(%rcx), %zmm14
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm14, %zmm3, %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm7, %zmm1 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqa64 320(%rax), %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqa 320(%r9), %ymm9
+; AVX512DQ-FAST-NEXT:    vmovdqa 320(%r8), %ymm3
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm7 = ymm3[0],ymm9[0],ymm3[2],ymm9[2]
+; AVX512DQ-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k2} = zmm7[2,3,2,3],zmm2[2,3,2,3]
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 320(%r8), %zmm10
-; AVX512DQ-FAST-NEXT:    vmovdqa64 320(%r9), %zmm22
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm10, %zmm0
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm22, %zmm3, %zmm0
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm21, %zmm17, %zmm0
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm10, %zmm0
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm22, %zmm6, %zmm0
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm21, %zmm1, %zmm0
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm22, %zmm0
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm10, %zmm29, %zmm0
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm21, %zmm18, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqa64 320(%r9), %zmm7
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm10, %zmm1
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm7, %zmm8, %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm8, %zmm22
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm2, %zmm0, %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm10, %zmm1
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm7, %zmm6, %zmm1
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm2, %zmm4, %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm7, %zmm0
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm10, %zmm20, %zmm0
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm2, %zmm19, %zmm0
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm22, %zmm8, %zmm10
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm10, %zmm14, %zmm21
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm10, %zmm8
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm7, %zmm23, %zmm10
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm10, %zmm18, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} ymm0 = [1,3,7,7]
+; AVX512DQ-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
+; AVX512DQ-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
 ; AVX512DQ-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
 ; AVX512DQ-FAST-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm30 # 32-byte Folded Reload
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %ymm30, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm24 # 32-byte Folded Reload
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %ymm24, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FAST-NEXT:    vpermt2q %ymm25, %ymm0, %ymm11
+; AVX512DQ-FAST-NEXT:    vpermt2q %ymm24, %ymm0, %ymm25
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %ymm25, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermt2q %ymm15, %ymm0, %ymm11
 ; AVX512DQ-FAST-NEXT:    vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FAST-NEXT:    vpermt2q %ymm9, %ymm0, %ymm5
-; AVX512DQ-FAST-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FAST-NEXT:    vpermt2q %ymm4, %ymm0, %ymm2
-; AVX512DQ-FAST-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa 384(%r9), %ymm1
-; AVX512DQ-FAST-NEXT:    vmovdqa 384(%r8), %ymm2
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[2],ymm1[2]
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vpermt2q %ymm1, %ymm0, %ymm2
-; AVX512DQ-FAST-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm29 = [3,0,12,4,3,0,12,4]
-; AVX512DQ-FAST-NEXT:    # zmm29 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm7, %zmm1
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm0, %zmm29, %zmm1
-; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm21 = [13,5,13,5,13,5,13,5]
-; AVX512DQ-FAST-NEXT:    # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm2
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm7, %zmm21, %zmm2
+; AVX512DQ-FAST-NEXT:    vpermt2q %ymm9, %ymm0, %ymm3
+; AVX512DQ-FAST-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa 384(%r9), %ymm4
+; AVX512DQ-FAST-NEXT:    vmovdqa 384(%r8), %ymm1
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm4[0],ymm1[2],ymm4[2]
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [6,14,6,14,6,14,6,14]
-; AVX512DQ-FAST-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm2
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm7, %zmm3, %zmm2
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm7, %zmm8, %zmm0
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    movb $48, %r10b
+; AVX512DQ-FAST-NEXT:    vpermt2q %ymm4, %ymm0, %ymm1
+; AVX512DQ-FAST-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 384(%rdx), %zmm19
+; AVX512DQ-FAST-NEXT:    vmovdqa64 384(%rcx), %zmm15
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm19, %zmm0
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm15, %zmm23, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqa64 384(%rdi), %zmm30
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 384(%rsi), %zmm9
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm9, %zmm23, %zmm30
+; AVX512DQ-FAST-NEXT:    movb $24, %r10b
 ; AVX512DQ-FAST-NEXT:    kmovw %r10d, %k3
-; AVX512DQ-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm11 = [0,8,0,1,0,8,0,1]
-; AVX512DQ-FAST-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm30 {%k3}
+; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm18 = [0,8,0,8,0,8,0,8]
+; AVX512DQ-FAST-NEXT:    # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm2
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm26, %zmm11, %zmm2
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [1,0,10,2,1,0,10,2]
-; AVX512DQ-FAST-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm26, %zmm2
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm0, %zmm7, %zmm2
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm1 {%k3} = zmm0[0],zmm26[0],zmm0[2],zmm26[2],zmm0[4],zmm26[4],zmm0[6],zmm26[6]
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm26, %zmm21, %zmm1
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} ymm30 = [6,14,6,14]
-; AVX512DQ-FAST-NEXT:    # ymm30 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm26, %zmm30, %zmm1
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} ymm18 = [15,7,15,7]
-; AVX512DQ-FAST-NEXT:    # ymm18 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm26, %zmm18, %zmm0
+; AVX512DQ-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm0 # 64-byte Folded Reload
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm4, %zmm29, %zmm1
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm4, %zmm2
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm0, %zmm21, %zmm2
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm4, %zmm2
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm0, %zmm3, %zmm2
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm0, %zmm8, %zmm4
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm4, %zmm0
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm2, %zmm11, %zmm0
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm27, %zmm18, %zmm0
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, %zmm0
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm4, %zmm7, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm16, %zmm18, %zmm0
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm1 {%k3} = zmm4[0],zmm2[0],zmm4[2],zmm2[2],zmm4[4],zmm2[4],zmm4[6],zmm2[6]
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm13, %zmm18, %zmm17
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm5, %zmm18, %zmm12
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm7, %zmm18, %zmm8
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 384(%r8), %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm1 = <u,1,2,3,4,15,u,u>
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm0, %zmm1, %zmm30
+; AVX512DQ-FAST-NEXT:    vmovdqa64 384(%r9), %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm2 = <u,1,2,3,4,5,15,u>
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm1, %zmm2, %zmm30
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm1, %zmm0, %zmm18
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm1, %zmm0, %zmm22
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm1, %zmm0, %zmm6
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm0, %zmm20, %zmm1
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 448(%rdx), %zmm7
+; AVX512DQ-FAST-NEXT:    vmovdqa64 448(%rcx), %zmm0
+; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [6,14,6,14,6,14,6,14]
+; AVX512DQ-FAST-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm7, %zmm2
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm0, %zmm1, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm7, %zmm10
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm0, %zmm2, %zmm10
+; AVX512DQ-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [3,0,12,4,3,0,12,4]
+; AVX512DQ-FAST-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm13
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm7, %zmm8, %zmm13
+; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm5 = [13,5,13,5,13,5,13,5]
+; AVX512DQ-FAST-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm7, %zmm17
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm0, %zmm5, %zmm17
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm0, %zmm23, %zmm7
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm11, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm4, %zmm8, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm3
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm4, %zmm0
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm2, %zmm21, %zmm0
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm11, %zmm5, %zmm0
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm4, %zmm0
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm2, %zmm30, %zmm0
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm11, %zmm1, %zmm0
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm2, %zmm18, %zmm4
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm11, %zmm23, %zmm4
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm27, %zmm0
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm2, %zmm29, %zmm0
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, %zmm1
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm27, %zmm21, %zmm1
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, %zmm1
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm27, %zmm3, %zmm1
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm27, %zmm8, %zmm2
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, %zmm1
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm19, %zmm11, %zmm1
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm19, %zmm1
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm2, %zmm7, %zmm1
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm0 {%k3} = zmm2[0],zmm19[0],zmm2[2],zmm19[2],zmm2[4],zmm19[4],zmm2[6],zmm19[6]
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, %zmm0
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm19, %zmm21, %zmm0
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, %zmm0
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm19, %zmm30, %zmm0
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm19, %zmm18, %zmm2
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm15, %zmm0
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm2, %zmm29, %zmm0
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, %zmm1
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm15, %zmm21, %zmm1
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, %zmm1
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm15, %zmm3, %zmm1
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm15, %zmm8, %zmm2
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, %zmm1
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm16, %zmm11, %zmm1
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm16, %zmm17
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm2, %zmm7, %zmm17
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm0 {%k3} = zmm2[0],zmm16[0],zmm2[2],zmm16[2],zmm2[4],zmm16[4],zmm2[6],zmm16[6]
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, %zmm0
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm16, %zmm21, %zmm0
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, %zmm0
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm16, %zmm30, %zmm0
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm16, %zmm18, %zmm2
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm12, %zmm0
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm28, %zmm29, %zmm0
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm28, %zmm0
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm12, %zmm21, %zmm0
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm28, %zmm0
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm12, %zmm3, %zmm0
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm12, %zmm8, %zmm28
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    movb $48, %r10b
+; AVX512DQ-FAST-NEXT:    kmovw %r10d, %k4
+; AVX512DQ-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm4 = [0,8,0,1,0,8,0,1]
+; AVX512DQ-FAST-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm11, %zmm6
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm2
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm31, %zmm11, %zmm2
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm31, %zmm2
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm0, %zmm7, %zmm2
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, %zmm19
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm1 {%k3} = zmm0[0],zmm31[0],zmm0[2],zmm31[2],zmm0[4],zmm31[4],zmm0[6],zmm31[6]
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm31, %zmm21, %zmm1
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm31, %zmm30, %zmm1
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm31, %zmm18, %zmm0
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm13, %zmm0
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm15, %zmm29, %zmm0
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm15, %zmm1
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm13, %zmm21, %zmm1
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm15, %zmm1
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm13, %zmm3, %zmm1
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm13, %zmm8, %zmm15
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm23, %zmm1
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm20, %zmm11, %zmm1
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm20, %zmm28
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm23, %zmm7, %zmm28
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm0 {%k3} = zmm23[0],zmm20[0],zmm23[2],zmm20[2],zmm23[4],zmm20[4],zmm23[6],zmm20[6]
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm23, %zmm0
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm20, %zmm21, %zmm0
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm0, %zmm4, %zmm6
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm6 = [1,0,10,2,1,0,10,2]
+; AVX512DQ-FAST-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm12
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm11, %zmm6, %zmm12
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm3 {%k4} = zmm11[0],zmm0[0],zmm11[2],zmm0[2],zmm11[4],zmm0[4],zmm11[6],zmm0[6]
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm11, %zmm3
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm0, %zmm5, %zmm3
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm11, %zmm3
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm0, %zmm1, %zmm3
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm0, %zmm23, %zmm11
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm16, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm11, %zmm8, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm3
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm11, %zmm0
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm16, %zmm5, %zmm0
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm23, %zmm0
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm20, %zmm30, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm11, %zmm0
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm16, %zmm1, %zmm0
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm20, %zmm18, %zmm23
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm23, %zmm26
-; AVX512DQ-FAST-NEXT:    vmovdqa64 448(%rdx), %zmm20
-; AVX512DQ-FAST-NEXT:    vmovdqa64 448(%rcx), %zmm5
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm20, %zmm13
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm16, %zmm23, %zmm11
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm12, %zmm11
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm5, %zmm0, %zmm13
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm20, %zmm10
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm5, %zmm3, %zmm10
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm5, %zmm9
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm20, %zmm29, %zmm9
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm20, %zmm16
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm5, %zmm21, %zmm16
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm5, %zmm8, %zmm20
-; AVX512DQ-FAST-NEXT:    vmovdqa64 384(%rdx), %zmm1
-; AVX512DQ-FAST-NEXT:    vmovdqa64 384(%rcx), %zmm4
-; AVX512DQ-FAST-NEXT:    vpermi2q %zmm4, %zmm1, %zmm0
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm0, %zmm4, %zmm11
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm11
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm12, %zmm6, %zmm11
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm3 {%k4} = zmm12[0],zmm0[0],zmm12[2],zmm0[2],zmm12[4],zmm0[4],zmm12[6],zmm0[6]
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm12, %zmm3
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm0, %zmm5, %zmm3
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm12, %zmm3
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm0, %zmm1, %zmm3
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm0, %zmm23, %zmm12
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm3
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm12, %zmm8, %zmm3
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm12, %zmm11
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm0, %zmm5, %zmm11
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm12, %zmm11
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm0, %zmm1, %zmm11
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm0, %zmm23, %zmm12
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm12, %zmm0
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm26, %zmm4, %zmm0
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vpermi2q %zmm1, %zmm4, %zmm29
-; AVX512DQ-FAST-NEXT:    vpermi2q %zmm4, %zmm1, %zmm3
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm26, %zmm11
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm12, %zmm6, %zmm11
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm3 {%k4} = zmm12[0],zmm26[0],zmm12[2],zmm26[2],zmm12[4],zmm26[4],zmm12[6],zmm26[6]
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm1, %zmm0
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm4, %zmm8, %zmm1
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm4, %zmm21, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm12, %zmm0
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm26, %zmm5, %zmm0
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 448(%rdi), %zmm12
-; AVX512DQ-FAST-NEXT:    vmovdqa64 448(%rsi), %zmm4
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm12, %zmm8
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm12, %zmm0
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm26, %zmm1, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm26, %zmm23, %zmm12
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, %zmm12
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm4, %zmm0, %zmm8
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm12, %zmm3
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm4, %zmm30, %zmm3
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm12, %zmm2
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm4, %zmm21, %zmm2
-; AVX512DQ-FAST-NEXT:    vmovdqa64 384(%rdi), %zmm6
-; AVX512DQ-FAST-NEXT:    vmovdqa64 384(%rsi), %zmm5
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm6, %zmm25
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm5, %zmm11, %zmm25
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm5, %zmm24
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm6, %zmm7, %zmm24
-; AVX512DQ-FAST-NEXT:    vpermi2q %zmm5, %zmm6, %zmm0
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm0, %zmm8, %zmm12
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm16
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm3, %zmm5, %zmm16
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm16
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm3, %zmm1, %zmm16
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm3, %zmm23, %zmm0
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm29 {%k3} = zmm6[0],zmm5[0],zmm6[2],zmm5[2],zmm6[4],zmm5[4],zmm6[6],zmm5[6]
-; AVX512DQ-FAST-NEXT:    vpermi2q %zmm5, %zmm6, %zmm21
-; AVX512DQ-FAST-NEXT:    vpermi2q %zmm5, %zmm6, %zmm30
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm5, %zmm18, %zmm6
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm9 {%k3} = zmm12[0],zmm4[0],zmm12[2],zmm4[2],zmm12[4],zmm4[4],zmm12[6],zmm4[6]
-; AVX512DQ-FAST-NEXT:    vpermi2q %zmm4, %zmm12, %zmm11
-; AVX512DQ-FAST-NEXT:    vpermi2q %zmm12, %zmm4, %zmm7
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm4, %zmm18, %zmm12
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm8, %zmm13 {%k1}
-; AVX512DQ-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm10[4,5,6,7]
-; AVX512DQ-FAST-NEXT:    vmovdqa64 448(%r8), %zmm5
-; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm4 = <0,11,u,u,4,5,6,7>
-; AVX512DQ-FAST-NEXT:    vpermi2q %zmm5, %zmm9, %zmm4
-; AVX512DQ-FAST-NEXT:    vmovdqa64 448(%r9), %zmm8
-; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm14 = <0,1,11,u,4,5,6,7>
-; AVX512DQ-FAST-NEXT:    vpermi2q %zmm8, %zmm4, %zmm14
-; AVX512DQ-FAST-NEXT:    movb $4, %sil
-; AVX512DQ-FAST-NEXT:    kmovw %esi, %k3
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm5, %zmm13 {%k3}
-; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm4 = <0,1,2,10,u,5,6,7>
-; AVX512DQ-FAST-NEXT:    vpermi2q %zmm8, %zmm13, %zmm4
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm16, %zmm2 {%k1}
-; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm10 = <12,u,u,3,4,5,6,13>
-; AVX512DQ-FAST-NEXT:    vpermi2q %zmm5, %zmm2, %zmm10
-; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm18 = <0,12,u,3,4,5,6,7>
-; AVX512DQ-FAST-NEXT:    vpermi2q %zmm8, %zmm10, %zmm18
-; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm10 = [0,8,0,8,0,8,0,8]
-; AVX512DQ-FAST-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm0 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm3
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm28, %zmm4, %zmm3
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm28, %zmm3
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm0, %zmm6, %zmm3
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, %zmm24
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm12 {%k4} = zmm0[0],zmm28[0],zmm0[2],zmm28[2],zmm0[4],zmm28[4],zmm0[6],zmm28[6]
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm3
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm28, %zmm5, %zmm3
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm3
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm28, %zmm1, %zmm3
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm28, %zmm23, %zmm0
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm29, %zmm28
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm0 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm0, %zmm8, %zmm28
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm3
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm29, %zmm5, %zmm3
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm3
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm29, %zmm1, %zmm3
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm29, %zmm23, %zmm0
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm0 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm29
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm3, %zmm4, %zmm29
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, %zmm27
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm0, %zmm6, %zmm27
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm28 {%k4} = zmm0[0],zmm3[0],zmm0[2],zmm3[2],zmm0[4],zmm3[4],zmm0[6],zmm3[6]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm12
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm3, %zmm5, %zmm12
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm12
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm3, %zmm1, %zmm12
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm3, %zmm23, %zmm0
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm14, %zmm26
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm0 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm0, %zmm8, %zmm26
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm3
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm14, %zmm5, %zmm3
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm3
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm14, %zmm1, %zmm3
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm14, %zmm23, %zmm0
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm23 # 64-byte Folded Reload
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm22, %zmm10, %zmm27
-; AVX512DQ-FAST-NEXT:    movb $24, %sil
-; AVX512DQ-FAST-NEXT:    kmovw %esi, %k4
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm1, %zmm6 {%k4}
+; AVX512DQ-FAST-NEXT:    vmovdqa64 448(%rdi), %zmm12
+; AVX512DQ-FAST-NEXT:    vmovdqa64 448(%rsi), %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm12, %zmm16
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm0, %zmm23, %zmm16
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm21, %zmm25
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm31, %zmm6, %zmm25
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm26 {%k4} = zmm31[0],zmm21[0],zmm31[2],zmm21[2],zmm31[4],zmm21[4],zmm31[6],zmm21[6]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm31, %zmm22
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm31, %zmm14
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm31, %zmm3
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm21, %zmm23, %zmm31
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm21, %zmm4, %zmm22
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm21, %zmm5, %zmm14
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm21, %zmm1, %zmm3
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm15, %zmm19, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm19, %zmm15, %zmm8
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm19, %zmm2
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm15, %zmm5, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm15, %zmm1, %zmm19
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm12, %zmm19
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm0, %zmm1, %zmm19
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm12, %zmm3
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm0, %zmm15, %zmm3
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm12, %zmm2
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm0, %zmm5, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm9, %zmm20
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm14, %zmm6, %zmm20
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm9, %zmm14, %zmm15
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm8 {%k4} = zmm14[0],zmm9[0],zmm14[2],zmm9[2],zmm14[4],zmm9[4],zmm14[6],zmm9[6]
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm9, %zmm14, %zmm5
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm14, %zmm23
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm9, %zmm1, %zmm14
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm9, %zmm4, %zmm23
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm13 {%k4} = zmm12[0],zmm0[0],zmm12[2],zmm0[2],zmm12[4],zmm0[4],zmm12[6],zmm0[6]
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm0, %zmm12, %zmm4
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm12, %zmm6, %zmm0
+; AVX512DQ-FAST-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm19, %zmm1 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    # zmm1 = zmm19[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, %zmm10 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqa64 448(%r8), %zmm3
+; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm9 = <0,11,u,u,4,5,6,7>
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm3, %zmm13, %zmm9
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm17, %zmm2 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm12 = <12,u,u,3,4,5,6,13>
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm3, %zmm2, %zmm12
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm7, %zmm16 {%k3}
+; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm2 = <u,1,2,3,4,15,u,u>
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm3, %zmm2, %zmm16
 ; AVX512DQ-FAST-NEXT:    movb $6, %sil
 ; AVX512DQ-FAST-NEXT:    kmovw %esi, %k5
-; AVX512DQ-FAST-NEXT:    vpbroadcastq 456(%rcx), %ymm0
-; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm7 {%k5}
-; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm1 = <0,1,2,9,u,u,6,7>
-; AVX512DQ-FAST-NEXT:    vpermi2q %zmm5, %zmm7, %zmm1
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm20, %zmm12 {%k4}
+; AVX512DQ-FAST-NEXT:    vpbroadcastq 456(%rcx), %ymm2
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = mem[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm0, %zmm0 {%k5}
+; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm2 = <0,1,2,9,u,u,6,7>
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm3, %zmm0, %zmm2
 ; AVX512DQ-FAST-NEXT:    movb $64, %sil
-; AVX512DQ-FAST-NEXT:    kmovw %esi, %k3
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm5, %zmm3 {%k3}
-; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm2 = <u,1,2,3,4,15,u,u>
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm5, %zmm2, %zmm12
-; AVX512DQ-FAST-NEXT:    vmovdqa64 384(%r8), %zmm5
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm5, %zmm2, %zmm6
-; AVX512DQ-FAST-NEXT:    vmovdqa64 384(%r9), %zmm2
-; AVX512DQ-FAST-NEXT:    vpermi2q %zmm2, %zmm5, %zmm10
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vpermi2q %zmm2, %zmm5, %zmm16
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vpermi2q %zmm2, %zmm5, %zmm13
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vpermi2q %zmm5, %zmm2, %zmm0
+; AVX512DQ-FAST-NEXT:    kmovw %esi, %k4
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, %zmm1 {%k4}
+; AVX512DQ-FAST-NEXT:    movb $4, %sil
+; AVX512DQ-FAST-NEXT:    kmovw %esi, %k4
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, %zmm10 {%k4}
+; AVX512DQ-FAST-NEXT:    vmovdqa64 448(%r9), %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm21 = <0,1,11,u,4,5,6,7>
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm0, %zmm9, %zmm21
+; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm3 = <0,1,2,10,u,5,6,7>
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm0, %zmm10, %zmm3
+; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm7 = <0,12,u,3,4,5,6,7>
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm0, %zmm12, %zmm7
+; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm9 = <13,u,2,3,4,5,6,14>
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm0, %zmm1, %zmm9
+; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm1 = <u,1,2,3,4,5,15,u>
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm0, %zmm1, %zmm16
 ; AVX512DQ-FAST-NEXT:    movb $12, %sil
-; AVX512DQ-FAST-NEXT:    kmovw %esi, %k3
-; AVX512DQ-FAST-NEXT:    vmovdqa 448(%rdx), %xmm5
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0]
-; AVX512DQ-FAST-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm5, %zmm0, %zmm11 {%k3}
-; AVX512DQ-FAST-NEXT:    vinserti32x4 $2, 448(%r8), %zmm11, %zmm5
-; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm7 = <0,1,2,3,4,8,u,7>
-; AVX512DQ-FAST-NEXT:    vpermi2q %zmm8, %zmm5, %zmm7
-; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm5 = <u,1,2,3,4,5,15,u>
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm2, %zmm5, %zmm6
-; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm11 = <0,1,2,3,9,u,6,7>
-; AVX512DQ-FAST-NEXT:    vpermi2q %zmm8, %zmm1, %zmm11
-; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm2 = <13,u,2,3,4,5,6,14>
-; AVX512DQ-FAST-NEXT:    vpermi2q %zmm8, %zmm3, %zmm2
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm8, %zmm5, %zmm12
-; AVX512DQ-FAST-NEXT:    vpbroadcastq 8(%rcx), %ymm1
-; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm9 {%k5}
-; AVX512DQ-FAST-NEXT:    vpbroadcastq 72(%rcx), %ymm1
-; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm31 {%k5}
-; AVX512DQ-FAST-NEXT:    vpbroadcastq 136(%rcx), %ymm1
-; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm8 {%k5}
-; AVX512DQ-FAST-NEXT:    vpbroadcastq 200(%rcx), %ymm1
-; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm17, %zmm3
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm3 {%k5}
-; AVX512DQ-FAST-NEXT:    vpbroadcastq 264(%rcx), %ymm1
-; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm19, %zmm20
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm20 {%k5}
-; AVX512DQ-FAST-NEXT:    vpbroadcastq 328(%rcx), %ymm1
-; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm28 {%k5}
-; AVX512DQ-FAST-NEXT:    vpbroadcastq 392(%rcx), %ymm1
-; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm24 {%k5}
-; AVX512DQ-FAST-NEXT:    vmovdqa64 448(%rax), %zmm5
-; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,10,5,6,7]
-; AVX512DQ-FAST-NEXT:    vpermi2q %zmm5, %zmm4, %zmm1
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,1,12,3,4,5,6,7]
-; AVX512DQ-FAST-NEXT:    vpermi2q %zmm5, %zmm18, %zmm1
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 384(%rax), %zmm1
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm1, %zmm4, %zmm16
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm16, %zmm4
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm17, %zmm16 {%k1}
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm16 {%k2} = zmm17[2,3,2,3],zmm1[2,3,2,3]
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm1, %zmm16, %zmm13
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm13, %zmm18
+; AVX512DQ-FAST-NEXT:    kmovw %esi, %k4
+; AVX512DQ-FAST-NEXT:    vmovdqa 448(%rdx), %xmm1
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0]
+; AVX512DQ-FAST-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm4 {%k4}
+; AVX512DQ-FAST-NEXT:    vinserti32x4 $2, 448(%r8), %zmm4, %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm4 = <0,1,2,3,4,8,u,7>
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm0, %zmm1, %zmm4
+; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm10 = <0,1,2,3,9,u,6,7>
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm0, %zmm2, %zmm10
+; AVX512DQ-FAST-NEXT:    vpbroadcastq 8(%rcx), %ymm0
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k5}
+; AVX512DQ-FAST-NEXT:    vpbroadcastq 72(%rcx), %ymm0
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm6 {%k5}
+; AVX512DQ-FAST-NEXT:    vpbroadcastq 136(%rcx), %ymm0
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm11, %zmm15
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm15 {%k5}
+; AVX512DQ-FAST-NEXT:    vpbroadcastq 200(%rcx), %ymm0
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm24, %zmm14
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm14 {%k5}
+; AVX512DQ-FAST-NEXT:    vpbroadcastq 264(%rcx), %ymm0
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm27 {%k5}
+; AVX512DQ-FAST-NEXT:    vpbroadcastq 328(%rcx), %ymm0
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm25 {%k5}
+; AVX512DQ-FAST-NEXT:    vpbroadcastq 392(%rcx), %ymm0
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm20 {%k5}
+; AVX512DQ-FAST-NEXT:    vmovdqa64 384(%rax), %zmm0
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm1, %zmm13, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm0, %zmm1, %zmm13
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm1, %zmm11 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm11 {%k2} = zmm1[2,3,2,3],zmm0[2,3,2,3]
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm0, %zmm1, %zmm24
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm0, %zmm1, %zmm11
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [14,1,2,3,4,5,6,15]
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm0, %zmm1, %zmm30
+; AVX512DQ-FAST-NEXT:    vmovdqa64 448(%rax), %zmm12
+; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,10,5,6,7]
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm12, %zmm3, %zmm0
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [0,1,12,3,4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm12, %zmm7, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [0,13,2,3,4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm12, %zmm9, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm12, %zmm1, %zmm16
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,5,8,7]
-; AVX512DQ-FAST-NEXT:    vpermi2q %zmm5, %zmm7, %zmm0
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm12, %zmm4, %zmm0
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm7 = [14,1,2,3,4,5,6,15]
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm1, %zmm7, %zmm6
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,9,6,7]
-; AVX512DQ-FAST-NEXT:    vpermi2q %zmm5, %zmm11, %zmm0
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm12, %zmm10, %zmm0
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [0,13,2,3,4,5,6,7]
-; AVX512DQ-FAST-NEXT:    vpermi2q %zmm5, %zmm2, %zmm0
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm5, %zmm7, %zmm12
 ; AVX512DQ-FAST-NEXT:    movb $8, %sil
 ; AVX512DQ-FAST-NEXT:    kmovw %esi, %k2
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm5, %zmm14 {%k2}
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm12, %zmm21 {%k2}
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k4}
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm5 {%k4}
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k3}
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm7 {%k4}
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm7 {%k3}
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k4}
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm3 {%k3}
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm11 {%k4}
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm15, %zmm26 {%k4}
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm9 {%k3}
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm4 {%k3}
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm31 {%k3}
 ; AVX512DQ-FAST-NEXT:    movb $-31, %sil
 ; AVX512DQ-FAST-NEXT:    kmovw %esi, %k2
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k2}
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm5 {%k2}
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm7 {%k2}
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k2}
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm3 {%k2}
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm11 {%k2}
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm9 {%k2}
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm26 {%k2}
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa (%rdx), %xmm0
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0]
-; AVX512DQ-FAST-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm13 {%k3}
-; AVX512DQ-FAST-NEXT:    vmovdqa 64(%rdx), %xmm0
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0]
-; AVX512DQ-FAST-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm4 {%k2}
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm31 {%k2}
+; AVX512DQ-FAST-NEXT:    vmovdqa (%rdx), %xmm4
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],mem[0]
+; AVX512DQ-FAST-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm1 {%k3}
-; AVX512DQ-FAST-NEXT:    vmovdqa 128(%rdx), %xmm0
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0]
-; AVX512DQ-FAST-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k3}
-; AVX512DQ-FAST-NEXT:    vmovdqa 192(%rdx), %xmm0
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0]
-; AVX512DQ-FAST-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm5 {%k3}
-; AVX512DQ-FAST-NEXT:    vmovdqa 256(%rdx), %xmm0
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0]
-; AVX512DQ-FAST-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm0, %zmm1 {%k4}
+; AVX512DQ-FAST-NEXT:    vmovdqa 64(%rdx), %xmm4
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],mem[0]
+; AVX512DQ-FAST-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm0, %zmm3 {%k4}
+; AVX512DQ-FAST-NEXT:    vmovdqa 128(%rdx), %xmm4
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],mem[0]
+; AVX512DQ-FAST-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm7 {%k3}
-; AVX512DQ-FAST-NEXT:    vmovdqa 320(%rdx), %xmm0
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0]
-; AVX512DQ-FAST-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm11 {%k3}
-; AVX512DQ-FAST-NEXT:    vmovdqa 384(%rdx), %xmm0
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0]
-; AVX512DQ-FAST-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm25 {%k3}
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm0, %zmm7 {%k4}
+; AVX512DQ-FAST-NEXT:    vmovdqa 192(%rdx), %xmm4
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],mem[0]
+; AVX512DQ-FAST-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm0, %zmm9 {%k4}
+; AVX512DQ-FAST-NEXT:    vmovdqa 256(%rdx), %xmm4
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],mem[0]
+; AVX512DQ-FAST-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm0, %zmm29 {%k4}
+; AVX512DQ-FAST-NEXT:    vmovdqa 320(%rdx), %xmm4
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],mem[0]
+; AVX512DQ-FAST-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm0, %zmm22 {%k4}
+; AVX512DQ-FAST-NEXT:    vmovdqa 384(%rdx), %xmm4
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],mem[0]
+; AVX512DQ-FAST-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm0, %zmm23 {%k4}
 ; AVX512DQ-FAST-NEXT:    movb $112, %cl
 ; AVX512DQ-FAST-NEXT:    kmovw %ecx, %k2
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vinserti64x2 $3, (%rax), %zmm0, %zmm13 {%k2}
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vinserti64x2 $3, 64(%rax), %zmm0, %zmm1 {%k2}
+; AVX512DQ-FAST-NEXT:    vinserti64x2 $3, (%rax), %zmm0, %zmm1 {%k2}
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vinserti64x2 $3, 128(%rax), %zmm0, %zmm2 {%k2}
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vinserti64x2 $3, 64(%rax), %zmm0, %zmm3 {%k2}
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vinserti64x2 $3, 192(%rax), %zmm0, %zmm5 {%k2}
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vinserti64x2 $3, 256(%rax), %zmm23, %zmm7 {%k2}
+; AVX512DQ-FAST-NEXT:    vinserti64x2 $3, 128(%rax), %zmm0, %zmm7 {%k2}
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vinserti64x2 $3, 320(%rax), %zmm27, %zmm11 {%k2}
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm11, %zmm27
-; AVX512DQ-FAST-NEXT:    vinserti64x2 $3, 384(%rax), %zmm10, %zmm25 {%k2}
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vinserti64x2 $3, 192(%rax), %zmm0, %zmm9 {%k2}
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vinserti64x2 $3, 256(%rax), %zmm0, %zmm29 {%k2}
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vinserti64x2 $3, 320(%rax), %zmm0, %zmm22 {%k2}
+; AVX512DQ-FAST-NEXT:    vinserti64x2 $3, 384(%rax), %zmm18, %zmm23 {%k2}
 ; AVX512DQ-FAST-NEXT:    movb $56, %cl
 ; AVX512DQ-FAST-NEXT:    kmovw %ecx, %k2
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm9 {%k2}
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k2}
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm31 {%k2}
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm6 {%k2}
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm8 {%k2}
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm15 {%k2}
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm3 {%k2}
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm14 {%k2}
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm20 {%k2}
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm20, %zmm26
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm27 {%k2}
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm28 {%k2}
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm4, %zmm24 {%k2}
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm25 {%k2}
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm13, %zmm20 {%k2}
 ; AVX512DQ-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm4 = ymm0[0,1,2,3,4,5],mem[6,7]
 ; AVX512DQ-FAST-NEXT:    movb $14, %cl
 ; AVX512DQ-FAST-NEXT:    kmovw %ecx, %k2
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm23 {%k2}
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm0, %zmm17 {%k2}
 ; AVX512DQ-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm22 {%k2}
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm4 = ymm0[0,1,2,3,4,5],mem[6,7]
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm0, %zmm15 {%k2}
 ; AVX512DQ-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm20 {%k2}
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm4 = ymm0[0,1,2,3,4,5],mem[6,7]
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm0, %zmm14 {%k2}
 ; AVX512DQ-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm16 {%k2}
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm4 = ymm0[0,1,2,3,4,5],mem[6,7]
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm0, %zmm7 {%k2}
 ; AVX512DQ-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm14 {%k2}
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm4 = ymm0[0,1,2,3,4,5],mem[6,7]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm0, %zmm28 {%k2}
 ; AVX512DQ-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm4 {%k2}
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm4 = ymm0[0,1,2,3,4,5],mem[6,7]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm0, %zmm26 {%k2}
 ; AVX512DQ-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm29 {%k2}
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm4 = ymm0[0,1,2,3,4,5],mem[6,7]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm0, %zmm8 {%k2}
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
@@ -16302,920 +16223,913 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k1}
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm3 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm4 {%k1}
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm5 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm10 {%k1}
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm7 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm12 {%k1}
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm8 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm13 {%k1}
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm21 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm5 {%k1}
 ; AVX512DQ-FAST-NEXT:    movb $120, %al
 ; AVX512DQ-FAST-NEXT:    kmovw %eax, %k1
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm1, %zmm19 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm1, %zmm11 {%k1}
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm17 # 64-byte Folded Reload
-; AVX512DQ-FAST-NEXT:    # zmm17 = zmm0[0,1,2,3],mem[4,5,6,7]
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, %zmm15 {%k1}
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, %zmm13 {%k1}
+; AVX512DQ-FAST-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm9 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    # zmm9 = zmm0[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, %zmm6 {%k1}
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm5, %zmm3 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm4, %zmm3 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm10, %zmm2 {%k1}
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm7, %zmm1 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm12, %zmm1 {%k1}
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm8, %zmm0 {%k1}
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm21, %zmm18 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm13, %zmm0 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm5, %zmm24 {%k1}
 ; AVX512DQ-FAST-NEXT:    movb $-61, %al
 ; AVX512DQ-FAST-NEXT:    kmovw %eax, %k1
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, %zmm17 {%k1}
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm5 # 64-byte Folded Reload
-; AVX512DQ-FAST-NEXT:    # zmm5 = zmm2[0,1,2,3],mem[4,5,6,7]
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, %zmm5 {%k1}
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm7 # 64-byte Folded Reload
-; AVX512DQ-FAST-NEXT:    # zmm7 = zmm2[0,1,2,3],mem[4,5,6,7]
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, %zmm7 {%k1}
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm8 # 64-byte Folded Reload
-; AVX512DQ-FAST-NEXT:    # zmm8 = zmm2[0,1,2,3],mem[4,5,6,7]
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, %zmm8 {%k1}
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm9 # 64-byte Folded Reload
-; AVX512DQ-FAST-NEXT:    # zmm9 = zmm2[0,1,2,3],mem[4,5,6,7]
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, %zmm9 {%k1}
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm10 # 64-byte Folded Reload
-; AVX512DQ-FAST-NEXT:    # zmm10 = zmm2[0,1,2,3],mem[4,5,6,7]
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, %zmm10 {%k1}
-; AVX512DQ-FAST-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm30, %zmm11 # 64-byte Folded Reload
-; AVX512DQ-FAST-NEXT:    # zmm11 = zmm30[0,1,2,3],mem[4,5,6,7]
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, %zmm11 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm4, %zmm9 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    # zmm4 = zmm4[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm5, %zmm4 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm5 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    # zmm5 = zmm5[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm10, %zmm5 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm10 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    # zmm10 = zmm10[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm12, %zmm10 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm12 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    # zmm12 = zmm12[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm13, %zmm12 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm13 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    # zmm13 = zmm13[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm18, %zmm13 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm18 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    # zmm18 = zmm18[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm19, %zmm18 {%k1}
 ; AVX512DQ-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm11, 3008(%rax)
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm18, 2944(%rax)
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm29, 2880(%rax)
-; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovaps %zmm2, 2816(%rax)
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm24, 2752(%rax)
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm25, 2688(%rax)
-; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovaps %zmm2, 2624(%rax)
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm10, 2560(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm18, 3008(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm24, 2944(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm8, 2880(%rax)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm8, 2816(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm20, 2752(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm23, 2688(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm31, 2624(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm13, 2560(%rax)
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, 2496(%rax)
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm4, 2432(%rax)
-; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovaps %zmm2, 2368(%rax)
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm28, 2304(%rax)
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm27, 2240(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm26, 2432(%rax)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm8, 2368(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm25, 2304(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm22, 2240(%rax)
 ; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 2176(%rax)
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm9, 2112(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm12, 2112(%rax)
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm1, 2048(%rax)
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm14, 1984(%rax)
-; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovaps %zmm2, 1920(%rax)
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm26, 1856(%rax)
-; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 1792(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm28, 1984(%rax)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm8, 1920(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm27, 1856(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm29, 1792(%rax)
 ; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 1728(%rax)
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm8, 1664(%rax)
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, 1600(%rax)
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm16, 1536(%rax)
-; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovaps %zmm2, 1472(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm10, 1664(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, 1600(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm7, 1536(%rax)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm8, 1472(%rax)
 ; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 1408(%rax)
 ; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 1344(%rax)
 ; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 1280(%rax)
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm7, 1216(%rax)
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm13, 1152(%rax)
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm20, 1088(%rax)
-; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovaps %zmm2, 1024(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm5, 1216(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, 1152(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm14, 1088(%rax)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm5, 1024(%rax)
 ; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 960(%rax)
 ; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 896(%rax)
 ; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 832(%rax)
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm5, 768(%rax)
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm15, 704(%rax)
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm22, 640(%rax)
-; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovaps %zmm2, 576(%rax)
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm31, 512(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm4, 768(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm6, 704(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm15, 640(%rax)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm4, 576(%rax)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 512(%rax)
 ; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 448(%rax)
 ; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 384(%rax)
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm17, 320(%rax)
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm19, 256(%rax)
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm23, 192(%rax)
-; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 128(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm9, 320(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm11, 256(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm17, 192(%rax)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm4, 128(%rax)
 ; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 64(%rax)
 ; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, (%rax)
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm12, 3520(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm16, 3520(%rax)
 ; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 3456(%rax)
 ; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 3392(%rax)
-; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 3328(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm21, 3328(%rax)
 ; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 3264(%rax)
 ; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 3200(%rax)
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm6, 3072(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm30, 3072(%rax)
 ; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 3136(%rax)
-; AVX512DQ-FAST-NEXT:    addq $6568, %rsp # imm = 0x19A8
+; AVX512DQ-FAST-NEXT:    addq $6120, %rsp # imm = 0x17E8
 ; AVX512DQ-FAST-NEXT:    vzeroupper
 ; AVX512DQ-FAST-NEXT:    retq
 ;
 ; AVX512BW-ONLY-SLOW-LABEL: store_i64_stride7_vf64:
 ; AVX512BW-ONLY-SLOW:       # %bb.0:
-; AVX512BW-ONLY-SLOW-NEXT:    subq $6600, %rsp # imm = 0x19C8
+; AVX512BW-ONLY-SLOW-NEXT:    subq $6248, %rsp # imm = 0x1868
 ; AVX512BW-ONLY-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 64(%rdi), %zmm19
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%rsi), %zmm9
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 64(%rsi), %zmm16
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 64(%rdx), %zmm17
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm17, (%rsp) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%rdx), %zmm3
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 64(%rdi), %zmm29
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%rsi), %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%rcx), %zmm4
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 64(%rsi), %zmm13
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm13, (%rsp) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 64(%rdx), %zmm20
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%rdx), %zmm5
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%rcx), %zmm8
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 64(%rcx), %zmm18
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm10 = [11,3,11,3,11,3,11,3]
-; AVX512BW-ONLY-SLOW-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm28 = [2,10,0,3,2,10,0,3]
-; AVX512BW-ONLY-SLOW-NEXT:    # zmm28 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm11 = [11,3,11,3,11,3,11,3]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm31 = [2,10,0,3,2,10,0,3]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm31 = mem[0,1,2,3,0,1,2,3]
 ; AVX512BW-ONLY-SLOW-NEXT:    movb $96, %r10b
 ; AVX512BW-ONLY-SLOW-NEXT:    kmovd %r10d, %k1
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%r8), %zmm1
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%r9), %zmm11
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%rax), %zmm6
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 64(%rax), %zmm9
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%r9), %zmm15
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%rax), %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 64(%rax), %zmm6
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm25 = [15,7,15,7,15,7,15,7]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [9,1,9,1,9,1,9,1]
 ; AVX512BW-ONLY-SLOW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm11, %zmm0, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm14
-; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [4,9,0,3,4,9,0,3]
-; AVX512BW-ONLY-SLOW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm6, %zmm0, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm12
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm15, %zmm0, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm24 = [4,9,0,3,4,9,0,3]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm24 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm24, %zmm2
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm10, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm8, %zmm28, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm9, %zmm11, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm31, %zmm2
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa (%r9), %ymm5
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa (%r9), %ymm7
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 64(%r9), %ymm3
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa (%r8), %ymm0
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 64(%r8), %ymm4
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[2],ymm5[2]
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[2],ymm7[2]
 ; AVX512BW-ONLY-SLOW-NEXT:    movb $28, %r10b
 ; AVX512BW-ONLY-SLOW-NEXT:    kmovd %r10d, %k2
-; AVX512BW-ONLY-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm0[2,3,2,3],zmm6[2,3,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm0[2,3,2,3],zmm5[2,3,2,3]
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [4,12,0,5,4,12,0,5]
 ; AVX512BW-ONLY-SLOW-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm5
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm11, %zmm2, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm16
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm15, %zmm2, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm7
 ; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [0,1,12,7,0,1,12,7]
 ; AVX512BW-ONLY-SLOW-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm6, %zmm2, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm11
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm2, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm14
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [5,0,14,6,5,0,14,6]
-; AVX512BW-ONLY-SLOW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm17
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm13 = [5,0,14,6,5,0,14,6]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm15, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm15, %zmm9
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm13, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [0,13,6,7,0,13,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm2, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm15
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm9, %zmm25, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [6,13,14,7,6,13,14,7]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm10, %zmm5
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm0, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm19
-; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [0,13,6,7,0,13,6,7]
-; AVX512BW-ONLY-SLOW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm6, %zmm0, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm15
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm5 = [15,7,15,7,15,7,15,7]
-; AVX512BW-ONLY-SLOW-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm17, %zmm5, %zmm1
-; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [6,13,14,7,6,13,14,7]
-; AVX512BW-ONLY-SLOW-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm7, %zmm6
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm29, %zmm1
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm13, %zmm10, %zmm1
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm18, %zmm28, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm19, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm16, %zmm11, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm18, %zmm31, %zmm2
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
 ; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm4[0],ymm3[0],ymm4[2],ymm3[2]
-; AVX512BW-ONLY-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm1[2,3,2,3],zmm9[2,3,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm1[2,3,2,3],zmm6[2,3,2,3]
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 64(%r8), %zmm1
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 64(%r9), %zmm3
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm14, %zmm18
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm14, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm4
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm9, %zmm12, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm16, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm9, %zmm11, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 64(%r9), %zmm16
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm16, %zmm8, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm6, %zmm24, %zmm3
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm19, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm9, %zmm0, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm5, %zmm1
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm7, %zmm9
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 128(%rdi), %zmm23
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 128(%rsi), %zmm24
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm1
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm24, %zmm10, %zmm1
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm30
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 128(%rdx), %zmm25
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 128(%rcx), %zmm12
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm25, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm12, %zmm28, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 128(%rax), %zmm9
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 128(%r9), %ymm6
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm16, %zmm7, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm6, %zmm14, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm13, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm6, %zmm15, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm29
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm16, %zmm25, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm10, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 128(%rdi), %zmm27
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 128(%rsi), %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm27, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm11, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm17
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 128(%rdx), %zmm21
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 128(%rcx), %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm31, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm3 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 128(%rax), %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 128(%r9), %ymm4
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 128(%r8), %ymm1
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm6[0],ymm1[2],ymm6[2]
-; AVX512BW-ONLY-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm1[2,3,2,3],zmm9[2,3,2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[2],ymm4[2]
+; AVX512BW-ONLY-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm1[2,3,2,3],zmm5[2,3,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 128(%r8), %zmm1
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 128(%r9), %zmm21
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm21, %zmm14, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm9, %zmm4, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm10
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm3
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm21, %zmm16, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm9, %zmm11, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm19, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm9, %zmm0, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm21, %zmm5, %zmm1
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm7, %zmm9
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 192(%rdi), %zmm27
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 192(%rsi), %zmm26
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm27, %zmm1
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm26, %zmm30, %zmm1
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 192(%rdx), %zmm17
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 192(%rcx), %zmm14
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm14, %zmm28, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 192(%rax), %zmm21
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 192(%r9), %ymm9
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 128(%r9), %zmm6
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm6, %zmm8, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm11
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm24, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm6, %zmm7, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm14, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm13, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm15, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm20
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm6, %zmm25, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm10, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 192(%rdi), %zmm26
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 192(%rsi), %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm26, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm17, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 192(%rdx), %zmm18
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 192(%rcx), %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm31, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm3 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 192(%rax), %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 192(%r9), %ymm5
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 192(%r8), %ymm1
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm9[0],ymm1[2],ymm9[2]
-; AVX512BW-ONLY-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm1[2,3,2,3],zmm21[2,3,2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm5[0],ymm1[2],ymm5[2]
+; AVX512BW-ONLY-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm1[2,3,2,3],zmm2[2,3,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 192(%r8), %zmm1
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 192(%r9), %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm18, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm21, %zmm4, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm16, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm21, %zmm11, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 192(%r9), %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm8, %zmm11, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm2, %zmm24, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm8, %zmm7, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm28
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm2, %zmm14, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm14, %zmm30
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm13, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm15, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm2, %zmm15, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm9
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm8, %zmm25, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm10, %zmm0
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm19, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm21, %zmm15, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm15, %zmm31
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm5, %zmm1
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm7, %zmm21
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 256(%rdi), %zmm21
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 256(%rdi), %zmm15
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 256(%rsi), %zmm22
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm22, %zmm30, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 256(%rdx), %zmm16
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 256(%rcx), %zmm15
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm4
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm15, %zmm28, %zmm4
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm4 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm15, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm14
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm22, %zmm17, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 256(%rdx), %zmm12
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 256(%rcx), %zmm23
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm23, %zmm31, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm5 {%k1}
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 256(%rax), %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 256(%r9), %ymm6
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 256(%r8), %ymm2
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm2 = ymm2[0],ymm6[0],ymm2[2],ymm6[2]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm6
-; AVX512BW-ONLY-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm2[2,3,2,3],zmm0[2,3,2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 256(%r8), %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 256(%r9), %zmm13
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm4
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm13, %zmm18, %zmm4
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm10, %zmm4
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm4
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm13, %zmm3, %zmm4
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm6, %zmm11, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 256(%r9), %ymm7
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 256(%r8), %ymm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm7[0],ymm1[2],ymm7[2]
+; AVX512BW-ONLY-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm1[2,3,2,3],zmm0[2,3,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 256(%r8), %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 256(%r9), %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm17
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm2, %zmm11, %zmm17
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm24, %zmm17
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm11
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm2, %zmm28, %zmm11
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm30, %zmm11
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm11
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm13, %zmm11
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm3, %zmm11
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm4
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm2, %zmm19, %zmm4
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm6, %zmm31, %zmm4
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm13, %zmm5, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm2, %zmm7, %zmm6
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 320(%rdi), %zmm6
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 320(%rsi), %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm4
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm2, %zmm30, %zmm4
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 320(%rdx), %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 320(%rcx), %zmm13
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm9
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm13, %zmm28, %zmm9
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm9 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 320(%rax), %zmm3
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 320(%r9), %ymm30
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %ymm30, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 320(%r8), %ymm4
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm4 = ymm4[0],ymm30[0],ymm4[2],ymm30[2]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm30
-; AVX512BW-ONLY-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm9 {%k2} = zmm4[2,3,2,3],zmm3[2,3,2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 320(%r8), %zmm4
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 320(%r9), %zmm9
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm3
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm9, %zmm18, %zmm3
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm30, %zmm10, %zmm3
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm3
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm9, %zmm0, %zmm3
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm30, %zmm11, %zmm3
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm19, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm30, %zmm31, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm2, %zmm25, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm10, %zmm0
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm9, %zmm5, %zmm4
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm7, %zmm30
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm11 = [13,5,13,5,13,5,13,5]
-; AVX512BW-ONLY-SLOW-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm8, %zmm11, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 320(%rdi), %zmm19
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 320(%rsi), %zmm0
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [6,14,6,14,6,14,6,14]
-; AVX512BW-ONLY-SLOW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm3
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm8, %zmm0, %zmm3
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm3
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm8, %zmm5, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm19, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm14, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 320(%rdx), %zmm11
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 320(%rcx), %zmm17
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm14
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm17, %zmm31, %zmm14
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm14 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 320(%rax), %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 320(%r9), %ymm5
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 320(%r8), %ymm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm5 = ymm0[0],ymm5[0],ymm0[2],ymm5[2]
+; AVX512BW-ONLY-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm5[2,3,2,3],zmm1[2,3,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 320(%r8), %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 320(%r9), %zmm14
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm14, %zmm7, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm24, %zmm0
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [3,0,12,4,3,0,12,4]
-; AVX512BW-ONLY-SLOW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm0, %zmm8
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm9
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm4
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm8, %zmm11, %zmm4
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm4
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm8, %zmm3, %zmm4
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm4
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm8, %zmm5, %zmm4
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm0, %zmm8
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm10
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm25, %zmm1
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm12, %zmm11, %zmm1
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm25, %zmm1
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm12, %zmm3, %zmm1
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm25, %zmm1
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm12, %zmm5, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm14, %zmm28, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm30, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm14, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm13, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm3, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm14, %zmm25, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm10, %zmm1
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm25, %zmm0, %zmm12
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm30
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm1
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm14, %zmm11, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 384(%rdx), %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 384(%rcx), %zmm24
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm24, %zmm25, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 384(%rdi), %zmm10
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 384(%rsi), %zmm1
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm1
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm14, %zmm3, %zmm1
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm1
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm14, %zmm5, %zmm1
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm17, %zmm0, %zmm14
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm14, %zmm17
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm1
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm15, %zmm11, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm25, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT:    movb $24, %r10b
+; AVX512BW-ONLY-SLOW-NEXT:    kmovd %r10d, %k3
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm10 {%k3}
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [0,8,0,8,0,8,0,8]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm16, %zmm1, %zmm29
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm6, %zmm1, %zmm20
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm8, %zmm1, %zmm9
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm2, %zmm1, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm14, %zmm1, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 384(%r8), %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = <u,1,2,3,4,15,u,u>
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm1, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 384(%r9), %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = <u,1,2,3,4,5,15,u>
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm3, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm1, %zmm0, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm1, %zmm0, %zmm28
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm13, %zmm1
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm1
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm15, %zmm3, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm7 = [13,5,13,5,13,5,13,5]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm7, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm10 = [6,14,6,14,6,14,6,14]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm10, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm25, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [3,0,12,4,3,0,12,4]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm13, %zmm7, %zmm1
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm1
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm15, %zmm5, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm13, %zmm10, %zmm1
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm16, %zmm0, %zmm15
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm15, %zmm16
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm1
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm13, %zmm11, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm13, %zmm25, %zmm1
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm1
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm4
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm13, %zmm3, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm2, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm14
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm29, %zmm7, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm29, %zmm10, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm29, %zmm25, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm21, %zmm2, %zmm29
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm20, %zmm7, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm20, %zmm10, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm20, %zmm25, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm18, %zmm2, %zmm20
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm23, %zmm7, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm23, %zmm10, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm23, %zmm25, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm12, %zmm2, %zmm23
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm21
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm17, %zmm7, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm17, %zmm10, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm17, %zmm25, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm11, %zmm2, %zmm17
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm28
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 448(%rdx), %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 448(%rcx), %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm31, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm24, %zmm11
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm24, %zmm5, %zmm31
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm31, (%rsp) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm24, %zmm7, %zmm1
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm1
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm13, %zmm5, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm24, %zmm10, %zmm1
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm2, %zmm0, %zmm13
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm18
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 448(%rdx), %zmm14
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 448(%rcx), %zmm1
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm14, %zmm12
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm28, %zmm12
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm14, %zmm19
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm3, %zmm19
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 384(%rdx), %zmm3
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 384(%rcx), %zmm20
-; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm20, %zmm3, %zmm28
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm20, %zmm11, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm20, %zmm3, %zmm4
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm25
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm20, %zmm5, %zmm25
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm0, %zmm20
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm7
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm14, %zmm0, %zmm7
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm14, %zmm15
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm5, %zmm14
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm11, %zmm15
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm2, %zmm11
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm9
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm8, %zmm2, %zmm9
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm12
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm10, %zmm12
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm7, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm25, %zmm8
 ; AVX512BW-ONLY-SLOW-NEXT:    movb $48, %r10b
-; AVX512BW-ONLY-SLOW-NEXT:    kmovd %r10d, %k3
-; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [1,0,10,2,1,0,10,2]
-; AVX512BW-ONLY-SLOW-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    kmovd %r10d, %k4
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm5 = [1,0,10,2,1,0,10,2]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm2
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm1, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm5, %zmm2
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm9 {%k3} = zmm0[0],zmm3[0],zmm0[2],zmm3[2],zmm0[4],zmm3[4],zmm0[6],zmm3[6]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm4
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm5
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm9
-; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [0,8,0,1,0,8,0,1]
-; AVX512BW-ONLY-SLOW-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm2, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 {%k4} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm16
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm4 = [0,8,0,1,0,8,0,1]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm4, %zmm0
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm11, %zmm4
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm13 = [6,14,6,14]
-; AVX512BW-ONLY-SLOW-NEXT:    # ymm13 = mem[0,1,0,1]
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm13, %zmm5
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm8 = [15,7,15,7]
-; AVX512BW-ONLY-SLOW-NEXT:    # ymm8 = mem[0,1,0,1]
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm8, %zmm9
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm7, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm10, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm25, %zmm16
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm5, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm14 {%k4} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm3
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm29, %zmm1, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm14
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm4, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm7, %zmm3
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm10 {%k3} = zmm29[0],zmm0[0],zmm29[2],zmm0[2],zmm29[4],zmm0[4],zmm29[6],zmm0[6]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm29, %zmm4
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm29, %zmm3
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm29, %zmm5
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm2, %zmm29
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm10, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm25, %zmm14
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm27, %zmm5, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm29 {%k4} = zmm27[0],zmm0[0],zmm27[2],zmm0[2],zmm27[4],zmm0[4],zmm27[6],zmm0[6]
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm11, %zmm4
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm13, %zmm3
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm8, %zmm5
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm24, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm23, %zmm1, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm30 {%k3} = zmm23[0],zmm24[0],zmm23[2],zmm24[2],zmm23[4],zmm24[4],zmm23[6],zmm24[6]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm3
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm4
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm24, %zmm2, %zmm23
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm24, %zmm11, %zmm3
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm24, %zmm13, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm24, %zmm8, %zmm4
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm26, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm27, %zmm1, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm17 {%k3} = zmm27[0],zmm26[0],zmm27[2],zmm26[2],zmm27[4],zmm26[4],zmm27[6],zmm26[6]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm27, %zmm3
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm27, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm27, %zmm31
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm26, %zmm2, %zmm27
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm27, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm27, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm27, %zmm23
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm4, %zmm27
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm26, %zmm11, %zmm3
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm26, %zmm13, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm26, %zmm8, %zmm31
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm7, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm10, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm25, %zmm23
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm26, %zmm5, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm20 {%k4} = zmm26[0],zmm0[0],zmm26[2],zmm0[2],zmm26[4],zmm0[4],zmm26[6],zmm0[6]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm26, %zmm29
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm26, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm26, %zmm31
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm4, %zmm26
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm7, %zmm29
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm10, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm25, %zmm31
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm22, %zmm30
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm21, %zmm1, %zmm30
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm16 {%k3} = zmm21[0],zmm22[0],zmm21[2],zmm22[2],zmm21[4],zmm22[4],zmm21[6],zmm22[6]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm3
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm27
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm29
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm22, %zmm2, %zmm29
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm22, %zmm11, %zmm3
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm22, %zmm13, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm15, %zmm5, %zmm30
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm21 {%k4} = zmm15[0],zmm22[0],zmm15[2],zmm22[2],zmm15[4],zmm22[4],zmm15[6],zmm22[6]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm15, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm15, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm15, %zmm26
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm15, %zmm27
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm22, %zmm4, %zmm27
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm22, %zmm7, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm22, %zmm10, %zmm0
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm22, %zmm8, %zmm27
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm26
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm6, %zmm1, %zmm26
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm18 {%k3} = zmm6[0],zmm0[0],zmm6[2],zmm0[2],zmm6[4],zmm0[4],zmm6[6],zmm0[6]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm28
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm3
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm23
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm24
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm2, %zmm24
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm11, %zmm28
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm13, %zmm3
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm8, %zmm23
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 448(%rdi), %zmm9
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm22, %zmm25, %zmm26
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 448(%rdi), %zmm15
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 448(%rsi), %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm6
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm5, %zmm6
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm3
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm13, %zmm3
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm4
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm11, %zmm4
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 384(%rdi), %zmm21
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 384(%rsi), %zmm17
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm22
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm21, %zmm1, %zmm22
-; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm17, %zmm21, %zmm5
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm17, %zmm21, %zmm11
-; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm17, %zmm21, %zmm13
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm20 {%k3} = zmm21[0],zmm17[0],zmm21[2],zmm17[2],zmm21[4],zmm17[4],zmm21[6],zmm17[6]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm18
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm17, %zmm2, %zmm21
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm17, %zmm8, %zmm18
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm7 {%k3} = zmm9[0],zmm0[0],zmm9[2],zmm0[2],zmm9[4],zmm0[4],zmm9[6],zmm0[6]
-; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm0, %zmm9, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm9, %zmm0, %zmm1
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm8, %zmm9
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm12 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm19[4,5,6,7]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 448(%r8), %zmm6
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm0 = <0,11,u,u,4,5,6,7>
-; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm6, %zmm7, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 448(%r9), %zmm7
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm8 = <0,1,11,u,4,5,6,7>
-; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm7, %zmm0, %zmm8
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm10
-; AVX512BW-ONLY-SLOW-NEXT:    movb $4, %sil
-; AVX512BW-ONLY-SLOW-NEXT:    kmovd %esi, %k3
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm12 {%k3}
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm8 = <0,1,2,10,u,5,6,7>
-; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm7, %zmm12, %zmm8
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm15, %zmm4 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm0 = <12,u,u,3,4,5,6,13>
-; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm6, %zmm4, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm4 = <0,12,u,3,4,5,6,7>
-; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm7, %zmm0, %zmm4
-; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,0,8,0,8,0,8]
-; AVX512BW-ONLY-SLOW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm15, %zmm17
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm25, %zmm17
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm19, %zmm22
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm25, %zmm22
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm24
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm19, %zmm5, %zmm24
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm28 {%k4} = zmm19[0],zmm1[0],zmm19[2],zmm1[2],zmm19[4],zmm1[4],zmm19[6],zmm1[6]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm19, %zmm18
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm19, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm19, %zmm21
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm4, %zmm21
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm7, %zmm18
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm10, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm19 # 64-byte Folded Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm12 # 64-byte Folded Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm12 # 64-byte Folded Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm12 # 64-byte Folded Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm12 # 64-byte Folded Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    movb $24, %sil
-; AVX512BW-ONLY-SLOW-NEXT:    kmovd %esi, %k5
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm25, %zmm18 {%k5}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm19, %zmm20
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm28, %zmm5, %zmm20
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm15, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm25, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm19, %zmm28, %zmm25
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm15, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm7, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm19, %zmm28, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm15, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm10, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm19, %zmm28, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm11 {%k4} = zmm28[0],zmm19[0],zmm28[2],zmm19[2],zmm28[4],zmm19[4],zmm28[6],zmm19[6]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm19, %zmm4, %zmm28
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm9 {%k4} = zmm15[0],zmm0[0],zmm15[2],zmm0[2],zmm15[4],zmm0[4],zmm15[6],zmm0[6]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm0, %zmm15, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm15, %zmm5, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm12[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm6 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 448(%r8), %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm5 = <0,11,u,u,4,5,6,7>
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm3, %zmm9, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm1 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm9 = <12,u,u,3,4,5,6,13>
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm3, %zmm1, %zmm9
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm17 {%k3}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = <u,1,2,3,4,15,u,u>
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm1, %zmm17
 ; AVX512BW-ONLY-SLOW-NEXT:    movb $6, %sil
-; AVX512BW-ONLY-SLOW-NEXT:    kmovd %esi, %k3
-; AVX512BW-ONLY-SLOW-NEXT:    vpbroadcastq 456(%rcx), %ymm12
-; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm12 = mem[0,1,2,3],ymm12[4,5,6,7]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm12, %zmm0, %zmm1 {%k3}
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm12 = <0,1,2,9,u,u,6,7>
-; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm6, %zmm1, %zmm12
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm14, %zmm9 {%k5}
-; AVX512BW-ONLY-SLOW-NEXT:    movb $64, %sil
 ; AVX512BW-ONLY-SLOW-NEXT:    kmovd %esi, %k4
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm3 {%k4}
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = <u,1,2,3,4,15,u,u>
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm6, %zmm1, %zmm9
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 384(%r8), %zmm6
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm6, %zmm1, %zmm18
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 384(%r9), %zmm1
-; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm1, %zmm6, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm1, %zmm6, %zmm14
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm1, %zmm6, %zmm15
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm6, %zmm1, %zmm16
+; AVX512BW-ONLY-SLOW-NEXT:    vpbroadcastq 456(%rcx), %ymm1
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm0 {%k4}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm8 = <0,1,2,9,u,u,6,7>
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm3, %zmm0, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    movb $64, %sil
+; AVX512BW-ONLY-SLOW-NEXT:    kmovd %esi, %k5
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm2 {%k5}
+; AVX512BW-ONLY-SLOW-NEXT:    movb $4, %sil
+; AVX512BW-ONLY-SLOW-NEXT:    kmovd %esi, %k5
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm6 {%k5}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 448(%r9), %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = <0,1,11,u,4,5,6,7>
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm0, %zmm5, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm11
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = <0,1,2,10,u,5,6,7>
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm0, %zmm6, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm5 = <0,12,u,3,4,5,6,7>
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm0, %zmm9, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm6 = <13,u,2,3,4,5,6,14>
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm0, %zmm2, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = <u,1,2,3,4,5,15,u>
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm1, %zmm17
 ; AVX512BW-ONLY-SLOW-NEXT:    movb $12, %sil
-; AVX512BW-ONLY-SLOW-NEXT:    kmovd %esi, %k4
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 448(%rdx), %xmm6
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm6, %zmm0, %zmm2 {%k4}
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti32x4 $2, 448(%r8), %zmm2, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm6 = <0,1,2,3,4,8,u,7>
-; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm7, %zmm2, %zmm6
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm2 = <u,1,2,3,4,5,15,u>
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm2, %zmm18
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = <0,1,2,3,9,u,6,7>
-; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm7, %zmm12, %zmm1
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm12 = <13,u,2,3,4,5,6,14>
-; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm7, %zmm3, %zmm12
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm2, %zmm9
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 448(%rax), %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,2,3,10,5,6,7]
-; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm2, %zmm8, %zmm3
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    kmovd %esi, %k5
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 448(%rdx), %xmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm0, %zmm4 {%k5}
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti32x4 $2, 448(%r8), %zmm4, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm4 = <0,1,2,3,4,8,u,7>
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm0, %zmm2, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm2 = <0,1,2,3,9,u,6,7>
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm0, %zmm8, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 384(%rax), %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm1, %zmm9
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 (%rsp), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm25, %zmm1 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 384(%r9), %ymm12
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 384(%r8), %ymm8
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm8 = ymm8[0],ymm12[0],ymm8[2],ymm12[2]
+; AVX512BW-ONLY-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k2} = zmm8[2,3,2,3],zmm0[2,3,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, (%rsp) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm8, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm8, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm8 = [14,1,2,3,4,5,6,15]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm8, %zmm12
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 448(%rax), %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm12 = [0,1,2,3,10,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm0, %zmm3, %zmm12
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,12,3,4,5,6,7]
-; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm2, %zmm4, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm0, %zmm5, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,13,2,3,4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm0, %zmm6, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm8, %zmm17
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,2,3,4,5,8,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm0, %zmm4, %zmm3
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 384(%rax), %zmm3
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm4, %zmm14
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm14, %zmm7
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm8 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 384(%r9), %ymm14
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 384(%r8), %ymm4
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm4 = ymm4[0],ymm14[0],ymm4[2],ymm14[2]
-; AVX512BW-ONLY-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm8 {%k2} = zmm4[2,3,2,3],zmm3[2,3,2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm4, %zmm15
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm15, %zmm8
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm4, %zmm16
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,8,7]
-; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm2, %zmm6, %zmm4
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [14,1,2,3,4,5,6,15]
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm4, %zmm18
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,2,3,4,9,6,7]
-; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm2, %zmm1, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm0, %zmm2, %zmm3
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,13,2,3,4,5,6,7]
-; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm2, %zmm12, %zmm1
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm2, %zmm4, %zmm9
 ; AVX512BW-ONLY-SLOW-NEXT:    movb $8, %sil
 ; AVX512BW-ONLY-SLOW-NEXT:    kmovd %esi, %k2
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm10 {%k2}
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k5}
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm3 {%k5}
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k5}
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm31 {%k5}
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm27 {%k5}
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm23 {%k5}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm11 {%k2}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm16 {%k3}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm14 {%k3}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm23 {%k3}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm31 {%k3}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm26 {%k3}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm22 {%k3}
 ; AVX512BW-ONLY-SLOW-NEXT:    movb $-31, %sil
 ; AVX512BW-ONLY-SLOW-NEXT:    kmovd %esi, %k2
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k2}
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm3 {%k2}
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k2}
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm31 {%k2}
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm27 {%k2}
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm23 {%k2}
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa (%rdx), %xmm1
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm16 {%k2}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm14 {%k2}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm23 {%k2}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm31 {%k2}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm26 {%k2}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm22 {%k2}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa (%rdx), %xmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm4 {%k5}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 64(%rdx), %xmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm5 {%k5}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 128(%rdx), %xmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm2 {%k4}
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 64(%rdx), %xmm1
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k5}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 192(%rdx), %xmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm3 {%k4}
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 128(%rdx), %xmm1
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm4 {%k4}
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 192(%rdx), %xmm1
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm6 {%k4}
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 256(%rdx), %xmm1
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm29 {%k4}
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 320(%rdx), %xmm1
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm24 {%k4}
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 384(%rdx), %xmm1
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm21 {%k4}
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti32x4 $3, (%rax), %zmm19, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm3 {%k5}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 256(%rdx), %xmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm27 {%k5}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 320(%rdx), %xmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm21 {%k5}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 384(%rdx), %xmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm28 {%k5}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti32x4 $3, (%rax), %zmm0, %zmm0
 ; AVX512BW-ONLY-SLOW-NEXT:    movb $112, %sil
 ; AVX512BW-ONLY-SLOW-NEXT:    kmovd %esi, %k2
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k2}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm4 {%k2}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti32x4 $3, 64(%rax), %zmm0, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm5 {%k2}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti32x4 $3, 128(%rax), %zmm0, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k2}
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti32x4 $3, 64(%rax), %zmm1, %zmm1
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm3 {%k2}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti32x4 $3, 192(%rax), %zmm0, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm3 {%k2}
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti32x4 $3, 128(%rax), %zmm1, %zmm1
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k2}
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti32x4 $3, 192(%rax), %zmm5, %zmm1
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm6 {%k2}
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti32x4 $3, 256(%rax), %zmm1, %zmm1
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm29 {%k2}
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti32x4 $3, 320(%rax), %zmm1, %zmm1
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm24 {%k2}
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti32x4 $3, 384(%rax), %zmm0, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti32x4 $3, 256(%rax), %zmm0, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm27 {%k2}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti32x4 $3, 320(%rax), %zmm0, %zmm0
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm21 {%k2}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti32x4 $3, 384(%rax), %zmm0, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm28 {%k2}
 ; AVX512BW-ONLY-SLOW-NEXT:    vpbroadcastq 8(%rcx), %ymm0
 ; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm1 {%k3}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm5 {%k4}
 ; AVX512BW-ONLY-SLOW-NEXT:    vpbroadcastq 72(%rcx), %ymm0
 ; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k3}
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k4}
 ; AVX512BW-ONLY-SLOW-NEXT:    vpbroadcastq 136(%rcx), %ymm0
 ; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 (%rsp), %zmm3 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm3 {%k3}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm3 {%k4}
 ; AVX512BW-ONLY-SLOW-NEXT:    vpbroadcastq 200(%rcx), %ymm0
 ; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm4 {%k3}
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm4 {%k4}
 ; AVX512BW-ONLY-SLOW-NEXT:    vpbroadcastq 264(%rcx), %ymm0
 ; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm30 {%k3}
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm30 {%k4}
 ; AVX512BW-ONLY-SLOW-NEXT:    vpbroadcastq 328(%rcx), %ymm0
 ; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm26 {%k3}
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm24 {%k4}
 ; AVX512BW-ONLY-SLOW-NEXT:    vpbroadcastq 392(%rcx), %ymm0
 ; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm22 {%k3}
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm20 {%k4}
 ; AVX512BW-ONLY-SLOW-NEXT:    movb $56, %cl
 ; AVX512BW-ONLY-SLOW-NEXT:    kmovd %ecx, %k2
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k2}
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm5 {%k2}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k2}
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm3 {%k2}
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, (%rsp) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm4 {%k2}
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm30 {%k2}
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm26 {%k2}
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm22 {%k2}
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm24 {%k2}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm20 {%k2}
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k1}
@@ -17226,64 +17140,65 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm4 {%k1}
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm29 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm5 {%k1}
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm28 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm18 {%k1}
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm11 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm7 {%k1}
 ; AVX512BW-ONLY-SLOW-NEXT:    movb $120, %cl
 ; AVX512BW-ONLY-SLOW-NEXT:    kmovd %ecx, %k1
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm16 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm15 # 64-byte Folded Reload
-; AVX512BW-ONLY-SLOW-NEXT:    # zmm15 = zmm0[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm17 {%k1}
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm19 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm19 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm16 # 64-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm16 = zmm0[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm14 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm15 {%k1}
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm25 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm10 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm31 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm5
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm5 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm29, %zmm25 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm29 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm23 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm12 {%k1}
 ; AVX512BW-ONLY-SLOW-NEXT:    movb $-61, %cl
 ; AVX512BW-ONLY-SLOW-NEXT:    kmovd %ecx, %k1
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm15 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm14 # 64-byte Folded Reload
-; AVX512BW-ONLY-SLOW-NEXT:    # zmm14 = zmm0[0,1,2,3],mem[4,5,6,7]
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm14 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm16 {%k1}
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm12 # 64-byte Folded Reload
-; AVX512BW-ONLY-SLOW-NEXT:    # zmm12 = zmm0[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm13 # 64-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm13 = zmm0[0,1,2,3],mem[4,5,6,7]
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm12 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm13 {%k1}
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
-; AVX512BW-ONLY-SLOW-NEXT:    # zmm3 = zmm0[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm11 # 64-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm11 = zmm0[0,1,2,3],mem[4,5,6,7]
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm3 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm11 {%k1}
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-ONLY-SLOW-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload
 ; AVX512BW-ONLY-SLOW-NEXT:    # zmm4 = zmm0[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm4 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm5 = zmm0[0,1,2,3],mem[4,5,6,7]
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm4 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm5 {%k1}
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-ONLY-SLOW-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm6 # 64-byte Folded Reload
 ; AVX512BW-ONLY-SLOW-NEXT:    # zmm6 = zmm0[0,1,2,3],mem[4,5,6,7]
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm6 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm7 # 64-byte Folded Reload
-; AVX512BW-ONLY-SLOW-NEXT:    # zmm7 = zmm13[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm7 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm7 # 64-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm7 = zmm10[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm7 {%k1}
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
 ; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
 ; AVX512BW-ONLY-SLOW-NEXT:    # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3]
@@ -17291,22 +17206,29 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7]
 ; AVX512BW-ONLY-SLOW-NEXT:    movb $14, %cl
 ; AVX512BW-ONLY-SLOW-NEXT:    kmovd %ecx, %k1
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm8, %zmm0, %zmm13 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm8, %zmm0, %zmm10 {%k1}
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
 ; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
 ; AVX512BW-ONLY-SLOW-NEXT:    # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3]
 ; AVX512BW-ONLY-SLOW-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,2,3,3]
 ; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm8, %zmm0, %zmm11 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm8, %zmm0, %zmm18 {%k1}
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
 ; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
 ; AVX512BW-ONLY-SLOW-NEXT:    # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3]
 ; AVX512BW-ONLY-SLOW-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,2,3,3]
 ; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm8, %zmm0, %zmm28 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm8, %zmm0, %zmm9 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT:    # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,2,3,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm8, %zmm0, %zmm3 {%k1}
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
 ; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
 ; AVX512BW-ONLY-SLOW-NEXT:    # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3]
@@ -17321,88 +17243,81 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7]
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
 ; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm8, %zmm0, %zmm1 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
-; AVX512BW-ONLY-SLOW-NEXT:    # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT:    # ymm8 = ymm0[1],mem[1],ymm0[3],mem[3]
 ; AVX512BW-ONLY-SLOW-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,2,3,3]
 ; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7]
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm8, %zmm0, %zmm0 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
-; AVX512BW-ONLY-SLOW-NEXT:    # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3]
-; AVX512BW-ONLY-SLOW-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,2,3,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm8, %zmm0, %zmm20 {%k1}
 ; AVX512BW-ONLY-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, 3008(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, 2944(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm20, 2880(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm7, 2816(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm22, 2752(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm21, 2688(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm23, 2624(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm12, 2944(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, 2880(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups (%rsp), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 2816(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm20, 2752(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm28, 2688(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm22, 2624(%rax)
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, 2560(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm31, 2496(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, 2432(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm23, 2496(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, 2432(%rax)
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm6, 2368(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm26, 2304(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm24, 2240(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm27, 2176(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, 2112(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm10, 2048(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, 1984(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm4, 1920(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm24, 2304(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm21, 2240(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm26, 2176(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, 2112(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm29, 2048(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, 1984(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm5, 1920(%rax)
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm30, 1856(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm29, 1792(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 1728(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, 1664(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm27, 1792(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm31, 1728(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, 1664(%rax)
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm25, 1600(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, 1536(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm3, 1472(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, 1536(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm4, 1472(%rax)
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 1408(%rax)
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 1344(%rax)
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 1280(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm12, 1216(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm19, 1152(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm28, 1088(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm2, 1024(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovups (%rsp), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm11, 1216(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm15, 1152(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, 1088(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm3, 1024(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 960(%rax)
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 896(%rax)
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 832(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm14, 768(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm17, 704(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm11, 640(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm1, 576(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, 768(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm14, 704(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm18, 640(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm2, 576(%rax)
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 512(%rax)
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 448(%rax)
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 384(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm15, 320(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm16, 256(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, 192(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm16, 320(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm19, 256(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm10, 192(%rax)
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 128(%rax)
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 64(%rax)
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, (%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, 3520(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm17, 3520(%rax)
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 3456(%rax)
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
@@ -17413,820 +17328,816 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 3264(%rax)
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 3200(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm18, 3072(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 3072(%rax)
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 3136(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    addq $6600, %rsp # imm = 0x19C8
+; AVX512BW-ONLY-SLOW-NEXT:    addq $6248, %rsp # imm = 0x1868
 ; AVX512BW-ONLY-SLOW-NEXT:    vzeroupper
 ; AVX512BW-ONLY-SLOW-NEXT:    retq
 ;
 ; AVX512BW-ONLY-FAST-LABEL: store_i64_stride7_vf64:
 ; AVX512BW-ONLY-FAST:       # %bb.0:
-; AVX512BW-ONLY-FAST-NEXT:    subq $6696, %rsp # imm = 0x1A28
+; AVX512BW-ONLY-FAST-NEXT:    subq $6120, %rsp # imm = 0x17E8
 ; AVX512BW-ONLY-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%rdi), %zmm5
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%rdi), %zmm22
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%rsi), %zmm23
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%rsi), %zmm17
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%rdx), %zmm24
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%rdx), %zmm6
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%rcx), %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%rdi), %zmm7
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%rcx), %zmm9
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm10 = [11,3,11,3,11,3,11,3]
-; AVX512BW-ONLY-FAST-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm12 = [2,10,0,3,2,10,0,3]
-; AVX512BW-ONLY-FAST-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%rdi), %zmm13
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%rsi), %zmm8
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%rsi), %zmm20
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm20, (%rsp) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%rdx), %zmm14
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%rdx), %zmm10
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%rcx), %zmm11
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%rcx), %zmm19
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm9 = [11,3,11,3,11,3,11,3]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [2,10,0,3,2,10,0,3]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
 ; AVX512BW-ONLY-FAST-NEXT:    movb $96, %r10b
 ; AVX512BW-ONLY-FAST-NEXT:    kmovd %r10d, %k1
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%r8), %zmm1
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%r9), %zmm8
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%rax), %zmm3
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%rax), %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%r9), %zmm18
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%rax), %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%rax), %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm23 = [15,7,15,7,15,7,15,7]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [9,1,9,1,9,1,9,1]
 ; AVX512BW-ONLY-FAST-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm8, %zmm0, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm14
-; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm27 = [4,9,0,3,4,9,0,3]
-; AVX512BW-ONLY-FAST-NEXT:    # zmm27 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm27, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm18, %zmm0, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm6
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [4,9,0,3,4,9,0,3]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm0, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm12
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm23, %zmm10, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm6, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm12, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm8, %zmm9, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm10, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm11, %zmm3, %zmm2
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k1}
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa (%r9), %ymm0
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 64(%r9), %ymm11
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa (%r8), %ymm6
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%r8), %ymm26
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm6[0],ymm0[0],ymm6[2],ymm0[2]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 64(%r9), %ymm10
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa (%r8), %ymm8
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 64(%r8), %ymm7
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm8[0],ymm0[0],ymm8[2],ymm0[2]
 ; AVX512BW-ONLY-FAST-NEXT:    movb $28, %r10b
 ; AVX512BW-ONLY-FAST-NEXT:    kmovd %r10d, %k2
-; AVX512BW-ONLY-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm0[2,3,2,3],zmm3[2,3,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm0[2,3,2,3],zmm4[2,3,2,3]
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [4,12,0,5,4,12,0,5]
 ; AVX512BW-ONLY-FAST-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, %zmm6
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm8, %zmm2, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm7
-; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [0,1,12,7,0,1,12,7]
-; AVX512BW-ONLY-FAST-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm2, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm18
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [5,0,14,6,5,0,14,6]
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm18, %zmm2, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [0,1,12,7,0,1,12,7]
 ; AVX512BW-ONLY-FAST-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm6, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm8, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [0,13,6,7,0,13,6,7]
-; AVX512BW-ONLY-FAST-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm0, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm20
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm15 = [15,7,15,7,15,7,15,7]
-; AVX512BW-ONLY-FAST-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm6, %zmm15, %zmm1
-; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm16 = [6,13,14,7,6,13,14,7]
-; AVX512BW-ONLY-FAST-NEXT:    # zmm16 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm16, %zmm3
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm22, %zmm1
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm17, %zmm10, %zmm1
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm24, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm9, %zmm12, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm26[0],ymm11[0],ymm26[2],ymm11[2]
-; AVX512BW-ONLY-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm1[2,3,2,3],zmm4[2,3,2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm8, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm22 = [5,0,14,6,5,0,14,6]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm22 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm18, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm22, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm15 = [0,13,6,7,0,13,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm15, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm18, %zmm23, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm18 = [6,13,14,7,6,13,14,7]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm18 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm18, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm20, %zmm9, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm14, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm19, %zmm3, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm7[0],ymm10[0],ymm7[2],ymm10[2]
+; AVX512BW-ONLY-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm1[2,3,2,3],zmm5[2,3,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%r8), %zmm1
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%r9), %zmm3
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm14, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm27, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm7, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm18, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm8, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm0, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%r9), %zmm30
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm30, %zmm6, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm12, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm30, %zmm2, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm8, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm30, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm22, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm15, %zmm2
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm15, %zmm1
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm16, %zmm4
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm30, %zmm23, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm18, %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 128(%rdi), %zmm1
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 128(%rsi), %zmm19
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm19, %zmm10, %zmm1
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 128(%rdx), %zmm3
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 128(%rcx), %zmm24
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm24, %zmm12, %zmm3
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm3 {%k1}
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 128(%rax), %zmm6
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 128(%rsi), %zmm26
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm26, %zmm9, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, %zmm29
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 128(%rdx), %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 128(%rcx), %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm3, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm9
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 128(%rax), %zmm7
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 128(%r9), %ymm1
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 128(%r8), %ymm4
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm2 = ymm4[0],ymm1[0],ymm4[2],ymm1[2]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %ymm4, %ymm25
-; AVX512BW-ONLY-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm2[2,3,2,3],zmm6[2,3,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 128(%r8), %ymm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; AVX512BW-ONLY-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm1[2,3,2,3],zmm7[2,3,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 128(%r8), %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 128(%r9), %zmm19
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm19, %zmm6, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm12, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm12, %zmm3
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 128(%r8), %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 128(%r9), %zmm1
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm3
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm14, %zmm3
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm6, %zmm27, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm19, %zmm5, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, %zmm13
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm8, %zmm3
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm3
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, %zmm4
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm7, %zmm3
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm6, %zmm18, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm19, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm22, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm15, %zmm20
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm15, %zmm3
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm3
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm2, %zmm8, %zmm3
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm6, %zmm0, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm19, %zmm23, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm18, %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 192(%rdi), %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 192(%rsi), %zmm28
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm28, %zmm29, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 192(%rdx), %zmm3
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm15, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm2, %zmm16, %zmm6
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 192(%rdi), %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 192(%rsi), %zmm11
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm11, %zmm10, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm10, %zmm28
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 192(%rdx), %zmm10
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 192(%rcx), %zmm1
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm12, %zmm10
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm12, %zmm5
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm10 {%k1}
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 192(%rax), %zmm30
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 192(%r9), %ymm17
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 192(%r8), %ymm12
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm12[0],ymm17[0],ymm12[2],ymm17[2]
-; AVX512BW-ONLY-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm10 {%k2} = zmm3[2,3,2,3],zmm30[2,3,2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 192(%r8), %zmm3
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 192(%r9), %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm10
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm14, %zmm10
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm14, %zmm7
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm30, %zmm27, %zmm10
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm13
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm4, %zmm13
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm18, %zmm6
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm30, %zmm18, %zmm13
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 192(%rcx), %zmm0
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm4
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm8, %zmm4
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm30, %zmm20, %zmm4
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm9, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm3 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 192(%rax), %zmm14
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 192(%r9), %ymm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 192(%r8), %ymm5
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm4 = ymm5[0],ymm0[0],ymm5[2],ymm0[2]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %ymm5, %ymm24
+; AVX512BW-ONLY-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm4[2,3,2,3],zmm14[2,3,2,3]
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm15, %zmm3
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm16, %zmm30
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 256(%rdi), %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 192(%r8), %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 192(%r9), %zmm17
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm17, %zmm6, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm6, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm14, %zmm12, %zmm3
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 256(%rsi), %zmm13
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm13, %zmm28, %zmm3
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm28, %zmm29
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 256(%rdx), %zmm28
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 256(%rcx), %zmm18
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm28, %zmm14
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm18, %zmm5, %zmm14
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, %zmm30
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm14 {%k1}
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 256(%rax), %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 256(%r9), %ymm9
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 256(%r8), %ymm22
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm4 = ymm22[0],ymm9[0],ymm22[2],ymm9[2]
-; AVX512BW-ONLY-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm4[2,3,2,3],zmm0[2,3,2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 256(%r8), %zmm4
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 256(%r9), %zmm3
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm14
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm7, %zmm14
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm5
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm27, %zmm14
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm14
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm3
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm2, %zmm14
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm6, %zmm14
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm17, %zmm13, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm14, %zmm8, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm17, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm22, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm14, %zmm15, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, %zmm21
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm17, %zmm23, %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm18, %zmm14
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 256(%rdi), %zmm5
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, %zmm10
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm8, %zmm10
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm20, %zmm10
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm20, %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 256(%rsi), %zmm15
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm29, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm15, %zmm29, %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 256(%rdx), %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 256(%rcx), %zmm29
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, %zmm8
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm29, %zmm9, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, %zmm3 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 256(%rax), %zmm16
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 256(%r9), %ymm10
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 256(%r8), %ymm5
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm6 = ymm5[0],ymm10[0],ymm5[2],ymm10[2]
+; AVX512BW-ONLY-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm6[2,3,2,3],zmm16[2,3,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 256(%r8), %zmm6
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 256(%r9), %zmm9
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm6, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm9, %zmm4, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm16, %zmm12, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm16, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm6, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm16
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm9, %zmm13, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm7, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm6, %zmm22, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm20, %zmm3
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm15, %zmm4
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm16, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm6, %zmm11
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm9, %zmm23, %zmm6
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm6, %zmm18, %zmm0
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 320(%rdi), %zmm4
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 320(%rsi), %zmm20
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm20, %zmm29, %zmm4
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 320(%rdx), %zmm31
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 320(%rcx), %zmm14
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm31, %zmm21
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm14, %zmm30, %zmm21
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm21 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 320(%rdi), %zmm27
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 320(%rsi), %zmm25
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm27, %zmm6
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm25, %zmm1, %zmm6
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 320(%rdx), %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 320(%rcx), %zmm12
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm12, %zmm8, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, %zmm13
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm6, %zmm0 {%k1}
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 320(%rax), %zmm1
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 320(%r9), %ymm4
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 320(%r8), %ymm2
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm10 = ymm2[0],ymm4[0],ymm2[2],ymm4[2]
-; AVX512BW-ONLY-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm21 {%k2} = zmm10[2,3,2,3],zmm1[2,3,2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 320(%r8), %zmm10
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 320(%r9), %zmm21
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm10, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm21, %zmm7, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm27, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 320(%r9), %ymm6
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 320(%r8), %ymm14
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm8 = ymm14[0],ymm6[0],ymm14[2],ymm6[2]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %ymm14, %ymm31
+; AVX512BW-ONLY-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k2} = zmm8[2,3,2,3],zmm1[2,3,2,3]
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm10, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm21, %zmm3, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm6, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 320(%r8), %zmm8
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 320(%r9), %zmm14
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm14, %zmm4, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm2
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm21, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm10, %zmm8, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm5, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm14, %zmm16, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm7, %zmm0
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm21, %zmm15, %zmm10
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm10, %zmm16, %zmm1
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm14, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm8, %zmm22, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm20, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm14, %zmm23, %zmm8
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm8, %zmm18, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} ymm0 = [1,3,7,7]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm26 # 32-byte Folded Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %ymm26, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm25 # 32-byte Folded Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %ymm25, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %ymm17, %ymm0, %ymm12
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %ymm9, %ymm0, %ymm22
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %ymm22, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %ymm4, %ymm0, %ymm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 384(%r9), %ymm1
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 384(%r8), %ymm2
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[2],ymm1[2]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %ymm1, %ymm0, %ymm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm27 = [3,0,12,4,3,0,12,4]
-; AVX512BW-ONLY-FAST-NEXT:    # zmm27 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm27, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm12 = [13,5,13,5,13,5,13,5]
-; AVX512BW-ONLY-FAST-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm3
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm12, %zmm3
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm30 = [6,14,6,14,6,14,6,14]
-; AVX512BW-ONLY-FAST-NEXT:    # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm3
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm30, %zmm3
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm15, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    movb $48, %r10b
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm24 # 32-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %ymm24, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %ymm10, %ymm0, %ymm5
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %ymm6, %ymm0, %ymm31
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %ymm31, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 384(%r9), %ymm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 384(%r8), %ymm4
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm5 = ymm4[0],ymm2[0],ymm4[2],ymm2[2]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %ymm2, %ymm0, %ymm4
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 384(%rdx), %zmm31
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 384(%rcx), %zmm10
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm31, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm10, %zmm23, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 384(%rdi), %zmm24
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 384(%rsi), %zmm18
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm18, %zmm23, %zmm24
+; AVX512BW-ONLY-FAST-NEXT:    movb $24, %r10b
 ; AVX512BW-ONLY-FAST-NEXT:    kmovd %r10d, %k3
-; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [0,8,0,1,0,8,0,1]
-; AVX512BW-ONLY-FAST-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm24 {%k3}
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm20 = [0,8,0,8,0,8,0,8]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm23, %zmm10, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm0 # 64-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm30, %zmm20, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm19, %zmm20, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm17, %zmm20, %zmm21
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm9, %zmm20, %zmm11
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm14, %zmm20, %zmm1
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [1,0,10,2,1,0,10,2]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 384(%r8), %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm1 = <u,1,2,3,4,15,u,u>
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm1, %zmm24
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 384(%r9), %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm2 = <u,1,2,3,4,5,15,u>
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm2, %zmm24
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm1, %zmm0, %zmm20
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm1, %zmm0, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm1, %zmm0, %zmm16
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm22, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm9
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 448(%rdx), %zmm30
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 448(%rcx), %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [6,14,6,14,6,14,6,14]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm30, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm1, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm30, %zmm8
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm13, %zmm8
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [3,0,12,4,3,0,12,4]
 ; AVX512BW-ONLY-FAST-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm23, %zmm9
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm7, %zmm9
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm2 {%k3} = zmm0[0],zmm23[0],zmm0[2],zmm23[2],zmm0[4],zmm23[4],zmm0[6],zmm23[6]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm11
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm30, %zmm7, %zmm11
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm5 = [13,5,13,5,13,5,13,5]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm30, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm5, %zmm2
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm23, %zmm12, %zmm1
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} ymm29 = [6,14,6,14]
-; AVX512BW-ONLY-FAST-NEXT:    # ymm29 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm23, %zmm29, %zmm1
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} ymm25 = [15,7,15,7]
-; AVX512BW-ONLY-FAST-NEXT:    # ymm25 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm23, %zmm25, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm23, %zmm30
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm17, %zmm0
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm27, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm7, %zmm0
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm12, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm17, %zmm5, %zmm2
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm30, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm17, %zmm1, %zmm2
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm15, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm17, %zmm23, %zmm3
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    movb $48, %r10b
+; AVX512BW-ONLY-FAST-NEXT:    kmovd %r10d, %k4
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm4 = [0,8,0,1,0,8,0,1]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3]
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm10, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm7, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm19, %zmm4, %zmm2
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm1 {%k3} = zmm3[0],zmm0[0],zmm3[2],zmm0[2],zmm3[4],zmm0[4],zmm3[6],zmm0[6]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm1
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm12, %zmm1
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm1
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm29, %zmm1
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm25, %zmm3
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm24, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm2, %zmm27, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm1
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm24, %zmm12, %zmm1
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm1
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm24, %zmm30, %zmm1
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm24, %zmm15, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm6 = [1,0,10,2,1,0,10,2]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm19, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm6, %zmm2
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm1
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm19, %zmm10, %zmm1
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm19, %zmm1
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm2, %zmm7, %zmm1
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm0 {%k3} = zmm2[0],zmm19[0],zmm2[2],zmm19[2],zmm2[4],zmm19[4],zmm2[6],zmm19[6]
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm0 {%k4} = zmm3[0],zmm19[0],zmm3[2],zmm19[2],zmm3[4],zmm19[4],zmm3[6],zmm19[6]
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm19, %zmm12, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm19, %zmm5, %zmm0
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm19, %zmm29, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm19, %zmm1, %zmm0
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm19, %zmm25, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm27, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm1
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm2, %zmm12, %zmm1
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm1
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm2, %zmm30, %zmm1
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm2, %zmm15, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm19, %zmm23, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm13, %zmm7, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm5, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm1, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm23, %zmm13
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm4, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm13, %zmm6, %zmm3
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm2 {%k4} = zmm13[0],zmm0[0],zmm13[2],zmm0[2],zmm13[4],zmm0[4],zmm13[6],zmm0[6]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm5, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm1, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm23, %zmm13
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm1
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm11, %zmm10, %zmm1
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm11, %zmm1
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm2, %zmm7, %zmm1
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm0 {%k3} = zmm2[0],zmm11[0],zmm2[2],zmm11[2],zmm2[4],zmm11[4],zmm2[6],zmm11[6]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm7, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm14
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm2, %zmm5, %zmm14
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm14
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm2, %zmm1, %zmm14
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm2, %zmm23, %zmm0
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm11, %zmm12, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm26, %zmm4, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm26, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm6, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, (%rsp) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm3 {%k4} = zmm0[0],zmm26[0],zmm0[2],zmm26[2],zmm0[4],zmm26[4],zmm0[6],zmm26[6]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm26, %zmm5, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm26, %zmm1, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm26, %zmm23, %zmm0
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm11, %zmm29, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm13, %zmm7, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm5, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm1, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm23, %zmm13
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm28, %zmm4, %zmm0
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm11, %zmm25, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm28, %zmm22
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm6, %zmm22
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm2 {%k4} = zmm3[0],zmm28[0],zmm3[2],zmm28[2],zmm3[4],zmm28[4],zmm3[6],zmm28[6]
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm18, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm28, %zmm27, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm28, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm18, %zmm12, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm28, %zmm5, %zmm0
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm28, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm18, %zmm30, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm28, %zmm1, %zmm0
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm18, %zmm15, %zmm28
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm28, %zmm23, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm29, %zmm28
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm7, %zmm28
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm13, %zmm10, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm29, %zmm5, %zmm2
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm7, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm29, %zmm1, %zmm2
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm1 {%k3} = zmm0[0],zmm13[0],zmm0[2],zmm13[2],zmm0[4],zmm13[4],zmm0[6],zmm13[6]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm13, %zmm12, %zmm1
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm13, %zmm29, %zmm1
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm13, %zmm25, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm29, %zmm23, %zmm0
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm14, %zmm1
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm31, %zmm27, %zmm1
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm31, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm14, %zmm12, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm31, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm14, %zmm30, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm14, %zmm15, %zmm31
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm29
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm15, %zmm4, %zmm29
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm15, %zmm21
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm6, %zmm21
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm28 {%k4} = zmm0[0],zmm15[0],zmm0[2],zmm15[2],zmm0[4],zmm15[4],zmm0[6],zmm15[6]
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm20, %zmm10, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm15, %zmm5, %zmm2
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm20, %zmm28
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm7, %zmm28
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm1 {%k3} = zmm0[0],zmm20[0],zmm0[2],zmm20[2],zmm0[4],zmm20[4],zmm0[6],zmm20[6]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm20, %zmm12, %zmm1
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm20, %zmm29, %zmm1
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm20, %zmm25, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm22
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 448(%rdx), %zmm13
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 448(%rcx), %zmm5
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm14
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm0, %zmm14
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm16
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm30, %zmm16
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, %zmm20
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm13, %zmm27, %zmm20
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm19
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm12, %zmm19
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm15, %zmm13
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 384(%rdx), %zmm4
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 384(%rcx), %zmm5
-; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm5, %zmm4, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm15, %zmm1, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm15, %zmm23, %zmm0
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm4, %zmm5, %zmm27
-; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm5, %zmm4, %zmm30
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm15, %zmm4
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm12, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm12, %zmm19
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm16, %zmm7, %zmm19
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm16, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm12, %zmm5, %zmm0
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 448(%rdi), %zmm11
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm16, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm12, %zmm1, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm12, %zmm23, %zmm16
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 448(%rdi), %zmm12
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 448(%rsi), %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm11, %zmm8
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm1, %zmm8
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm11, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm29, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm11, %zmm3
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm12, %zmm3
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 384(%rdi), %zmm6
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 384(%rsi), %zmm5
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm6, %zmm24
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm10, %zmm24
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, %zmm23
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm6, %zmm7, %zmm23
-; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm5, %zmm6, %zmm1
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm27 {%k3} = zmm6[0],zmm5[0],zmm6[2],zmm5[2],zmm6[4],zmm5[4],zmm6[6],zmm5[6]
-; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm5, %zmm6, %zmm12
-; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm5, %zmm6, %zmm29
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm25, %zmm6
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm20 {%k3} = zmm11[0],zmm0[0],zmm11[2],zmm0[2],zmm11[4],zmm0[4],zmm11[6],zmm0[6]
-; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm0, %zmm11, %zmm10
-; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm11, %zmm0, %zmm7
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm25, %zmm11
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, %zmm14 {%k1}
-; AVX512BW-ONLY-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,2,3],zmm16[4,5,6,7]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 448(%r8), %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm5 = <0,11,u,u,4,5,6,7>
-; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm2, %zmm20, %zmm5
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 448(%r9), %zmm25
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm15 = <0,1,11,u,4,5,6,7>
-; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm25, %zmm5, %zmm15
-; AVX512BW-ONLY-FAST-NEXT:    movb $4, %sil
-; AVX512BW-ONLY-FAST-NEXT:    kmovd %esi, %k3
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm14 {%k3}
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm16 = <0,1,2,10,u,5,6,7>
-; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm25, %zmm14, %zmm16
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm19, %zmm3 {%k1}
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm5 = <12,u,u,3,4,5,6,13>
-; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm2, %zmm3, %zmm5
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm19 = <0,12,u,3,4,5,6,7>
-; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm25, %zmm5, %zmm19
-; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm20 = [0,8,0,8,0,8,0,8]
-; AVX512BW-ONLY-FAST-NEXT:    # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm1 # 64-byte Folded Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm1 # 64-byte Folded Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm1 # 64-byte Folded Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm1 # 64-byte Folded Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm17 # 64-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm12, %zmm15
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm23, %zmm15
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm25, %zmm17
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm27, %zmm13
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm27, %zmm6, %zmm17
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm19 {%k4} = zmm27[0],zmm25[0],zmm27[2],zmm25[2],zmm27[4],zmm25[4],zmm27[6],zmm25[6]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm27, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm27, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm25, %zmm23, %zmm13
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm25, %zmm4, %zmm27
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm25, %zmm5, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm25, %zmm1, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm21, %zmm20, %zmm26
-; AVX512BW-ONLY-FAST-NEXT:    movb $24, %sil
-; AVX512BW-ONLY-FAST-NEXT:    kmovd %esi, %k4
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm6 {%k4}
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm10, %zmm31, %zmm26
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm31, %zmm10, %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm31, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm10, %zmm5, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm10, %zmm1, %zmm31
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm12, %zmm10
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm1, %zmm10
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm12, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm23, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm12, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm5, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm18, %zmm25
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm14, %zmm6, %zmm25
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm18, %zmm14, %zmm23
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm7 {%k4} = zmm14[0],zmm18[0],zmm14[2],zmm18[2],zmm14[4],zmm18[4],zmm14[6],zmm18[6]
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm18, %zmm14, %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm14, %zmm23
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm18, %zmm1, %zmm14
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm18, %zmm4, %zmm23
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm11 {%k4} = zmm12[0],zmm0[0],zmm12[2],zmm0[2],zmm12[4],zmm0[4],zmm12[6],zmm0[6]
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm0, %zmm12, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm12, %zmm6, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm1 # 64-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT:    # zmm1 = zmm10[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm8 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 448(%r8), %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm10 = <0,11,u,u,4,5,6,7>
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm3, %zmm11, %zmm10
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm6, %zmm2 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm11 = <12,u,u,3,4,5,6,13>
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm3, %zmm2, %zmm11
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm30, %zmm15 {%k3}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm2 = <u,1,2,3,4,15,u,u>
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm2, %zmm15
 ; AVX512BW-ONLY-FAST-NEXT:    movb $6, %sil
 ; AVX512BW-ONLY-FAST-NEXT:    kmovd %esi, %k5
-; AVX512BW-ONLY-FAST-NEXT:    vpbroadcastq 456(%rcx), %ymm1
-; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm7 {%k5}
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm1 = <0,1,2,9,u,u,6,7>
-; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm2, %zmm7, %zmm1
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm11 {%k4}
+; AVX512BW-ONLY-FAST-NEXT:    vpbroadcastq 456(%rcx), %ymm2
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = mem[0,1,2,3],ymm2[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm0, %zmm0 {%k5}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm2 = <0,1,2,9,u,u,6,7>
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm3, %zmm0, %zmm2
 ; AVX512BW-ONLY-FAST-NEXT:    movb $64, %sil
-; AVX512BW-ONLY-FAST-NEXT:    kmovd %esi, %k3
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm0 {%k3}
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm3 = <u,1,2,3,4,15,u,u>
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm2, %zmm3, %zmm11
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 384(%r8), %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm2, %zmm3, %zmm6
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 384(%r9), %zmm3
-; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm3, %zmm2, %zmm20
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm3, %zmm2, %zmm4
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm3, %zmm2, %zmm14
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm2, %zmm3, %zmm13
-; AVX512BW-ONLY-FAST-NEXT:    movb $12, %sil
-; AVX512BW-ONLY-FAST-NEXT:    kmovd %esi, %k3
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 448(%rdx), %xmm2
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm0, %zmm10 {%k3}
-; AVX512BW-ONLY-FAST-NEXT:    vinserti32x4 $2, 448(%r8), %zmm10, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm5 = <0,1,2,3,4,8,u,7>
-; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm25, %zmm2, %zmm5
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm2 = <u,1,2,3,4,5,15,u>
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm2, %zmm6
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm7 = <0,1,2,3,9,u,6,7>
-; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm25, %zmm1, %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    kmovd %esi, %k4
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm1 {%k4}
+; AVX512BW-ONLY-FAST-NEXT:    movb $4, %sil
+; AVX512BW-ONLY-FAST-NEXT:    kmovd %esi, %k4
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm8 {%k4}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 448(%r9), %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm31 = <0,1,11,u,4,5,6,7>
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm0, %zmm10, %zmm31
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm3 = <0,1,2,10,u,5,6,7>
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm0, %zmm8, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm8 = <0,12,u,3,4,5,6,7>
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm0, %zmm11, %zmm8
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm10 = <13,u,2,3,4,5,6,14>
-; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm25, %zmm0, %zmm10
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm25, %zmm2, %zmm11
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm0, %zmm1, %zmm10
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm1 = <u,1,2,3,4,5,15,u>
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm1, %zmm15
+; AVX512BW-ONLY-FAST-NEXT:    movb $12, %sil
+; AVX512BW-ONLY-FAST-NEXT:    kmovd %esi, %k4
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 448(%rdx), %xmm1
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm4 {%k4}
+; AVX512BW-ONLY-FAST-NEXT:    vinserti32x4 $2, 448(%r8), %zmm4, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm4 = <0,1,2,3,4,8,u,7>
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm0, %zmm1, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm11 = <0,1,2,3,9,u,6,7>
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm0, %zmm2, %zmm11
 ; AVX512BW-ONLY-FAST-NEXT:    vpbroadcastq 8(%rcx), %ymm0
 ; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm9 {%k5}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm14 {%k5}
 ; AVX512BW-ONLY-FAST-NEXT:    vpbroadcastq 72(%rcx), %ymm0
 ; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm21 {%k5}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm6 {%k5}
 ; AVX512BW-ONLY-FAST-NEXT:    vpbroadcastq 136(%rcx), %ymm0
 ; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm8 {%k5}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 (%rsp), %zmm2 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k5}
 ; AVX512BW-ONLY-FAST-NEXT:    vpbroadcastq 200(%rcx), %ymm0
 ; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm18 {%k5}
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm22 {%k5}
 ; AVX512BW-ONLY-FAST-NEXT:    vpbroadcastq 264(%rcx), %ymm0
 ; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm25 {%k5}
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm21 {%k5}
 ; AVX512BW-ONLY-FAST-NEXT:    vpbroadcastq 328(%rcx), %ymm0
 ; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm28 {%k5}
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm17 {%k5}
 ; AVX512BW-ONLY-FAST-NEXT:    vpbroadcastq 392(%rcx), %ymm0
 ; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm23 {%k5}
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 448(%rax), %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,10,5,6,7]
-; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm0, %zmm16, %zmm1
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,1,12,3,4,5,6,7]
-; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm0, %zmm19, %zmm1
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 384(%rax), %zmm1
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm3, %zmm4
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm16 {%k1}
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm16 {%k2} = zmm3[2,3,2,3],zmm1[2,3,2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm3, %zmm14
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm3, %zmm13
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,2,3,4,5,8,7]
-; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm0, %zmm5, %zmm3
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm5 = [14,1,2,3,4,5,6,15]
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm5, %zmm6
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,4,9,6,7]
-; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm0, %zmm7, %zmm1
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,13,2,3,4,5,6,7]
-; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm0, %zmm10, %zmm1
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm5, %zmm11
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm25 {%k5}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 384(%rax), %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm1, %zmm30
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm26 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm26 {%k2} = zmm1[2,3,2,3],zmm0[2,3,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm1, %zmm26
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm1, %zmm9
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [14,1,2,3,4,5,6,15]
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm1, %zmm24
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 448(%rax), %zmm12
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,10,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm12, %zmm3, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [0,1,12,3,4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm12, %zmm8, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [0,13,2,3,4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm12, %zmm10, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm12, %zmm1, %zmm15
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,5,8,7]
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm12, %zmm4, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,9,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm12, %zmm11, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-FAST-NEXT:    movb $8, %sil
 ; AVX512BW-ONLY-FAST-NEXT:    kmovd %esi, %k2
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm15 {%k2}
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm12, %zmm31 {%k2}
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k4}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k3}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm3 {%k4}
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm5 {%k4}
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm7 {%k4}
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm10 {%k4}
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm22 {%k4}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm1 {%k3}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k3}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm8 {%k3}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm9 {%k3}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm16, %zmm13 {%k3}
 ; AVX512BW-ONLY-FAST-NEXT:    movb $-31, %sil
 ; AVX512BW-ONLY-FAST-NEXT:    kmovd %esi, %k2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm0 {%k2}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k2}
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm3 {%k2}
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm5 {%k2}
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm7 {%k2}
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm4 {%k2}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm10 {%k2}
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm8 {%k2}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm22 {%k2}
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa (%rdx), %xmm0
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm1 {%k3}
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 64(%rdx), %xmm0
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm3 {%k3}
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 128(%rdx), %xmm0
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm5 {%k3}
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 192(%rdx), %xmm0
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm7 {%k3}
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 256(%rdx), %xmm0
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm10 {%k3}
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 320(%rdx), %xmm0
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm13 {%k3}
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 384(%rdx), %xmm0
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm24 {%k3}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm9 {%k2}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm13 {%k2}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa (%rdx), %xmm3
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm0, %zmm1 {%k4}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 64(%rdx), %xmm3
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm0, %zmm4 {%k4}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 128(%rdx), %xmm3
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm0, %zmm8 {%k4}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 192(%rdx), %xmm3
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm0, %zmm9 {%k4}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 256(%rdx), %xmm3
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm0, %zmm29 {%k4}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 320(%rdx), %xmm3
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm0, %zmm27 {%k4}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 384(%rdx), %xmm3
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm0, %zmm23 {%k4}
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vinserti32x4 $3, (%rax), %zmm0, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vinserti32x4 $3, (%rax), %zmm0, %zmm3
 ; AVX512BW-ONLY-FAST-NEXT:    movb $112, %cl
 ; AVX512BW-ONLY-FAST-NEXT:    kmovd %ecx, %k2
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k2}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm1 {%k2}
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vinserti32x4 $3, 64(%rax), %zmm0, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm3 {%k2}
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vinserti32x4 $3, 128(%rax), %zmm0, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm5 {%k2}
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vinserti32x4 $3, 64(%rax), %zmm0, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k2}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vinserti32x4 $3, 192(%rax), %zmm0, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm7 {%k2}
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vinserti32x4 $3, 256(%rax), %zmm17, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm10 {%k2}
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vinserti32x4 $3, 320(%rax), %zmm26, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm13 {%k2}
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm26
-; AVX512BW-ONLY-FAST-NEXT:    vinserti32x4 $3, 384(%rax), %zmm20, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm24 {%k2}
-; AVX512BW-ONLY-FAST-NEXT:    movb $56, %cl
-; AVX512BW-ONLY-FAST-NEXT:    kmovd %ecx, %k2
+; AVX512BW-ONLY-FAST-NEXT:    vinserti32x4 $3, 128(%rax), %zmm0, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm8 {%k2}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm9 {%k2}
+; AVX512BW-ONLY-FAST-NEXT:    vinserti32x4 $3, 192(%rax), %zmm0, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm9 {%k2}
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm21 {%k2}
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vinserti32x4 $3, 256(%rax), %zmm0, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm29 {%k2}
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm8 {%k2}
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vinserti32x4 $3, 320(%rax), %zmm0, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm27 {%k2}
+; AVX512BW-ONLY-FAST-NEXT:    vinserti32x4 $3, 384(%rax), %zmm20, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm23 {%k2}
+; AVX512BW-ONLY-FAST-NEXT:    movb $56, %cl
+; AVX512BW-ONLY-FAST-NEXT:    kmovd %ecx, %k2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm14 {%k2}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm6 {%k2}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm18 {%k2}
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k2}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, (%rsp) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm22 {%k2}
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm25 {%k2}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm21 {%k2}
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm28 {%k2}
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm23 {%k2}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm17 {%k2}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm30, %zmm25 {%k2}
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],mem[6,7]
 ; AVX512BW-ONLY-FAST-NEXT:    movb $14, %cl
 ; AVX512BW-ONLY-FAST-NEXT:    kmovd %ecx, %k2
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm22 {%k2}
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm21 {%k2}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm0, %zmm20 {%k2}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm0, %zmm14 {%k2}
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm19 {%k2}
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],mem[6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm0, %zmm13 {%k2}
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm15 {%k2}
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],mem[6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm0, %zmm6 {%k2}
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm8 {%k2}
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],mem[6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm0, %zmm28 {%k2}
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm3 {%k2}
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],mem[6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm0, %zmm19 {%k2}
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm27 {%k2}
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],mem[6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm0, %zmm7 {%k2}
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
@@ -18234,1089 +18145,1093 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k1}
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm5 {%k1}
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm7 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm4 {%k1}
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm9 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm8 {%k1}
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm10 {%k1}
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm12 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm11 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm5 {%k1}
 ; AVX512BW-ONLY-FAST-NEXT:    movb $120, %al
 ; AVX512BW-ONLY-FAST-NEXT:    kmovd %eax, %k1
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm18 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm16 {%k1}
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm16 # 64-byte Folded Reload
-; AVX512BW-ONLY-FAST-NEXT:    # zmm16 = zmm0[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm14 {%k1}
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, %zmm4 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm9 # 64-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT:    # zmm9 = zmm0[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm30 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm3 {%k1}
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, %zmm2 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, %zmm2 {%k1}
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, %zmm1 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm10, %zmm1 {%k1}
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm10, %zmm0 {%k1}
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm12, %zmm20 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm11, %zmm0 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, %zmm26 {%k1}
 ; AVX512BW-ONLY-FAST-NEXT:    movb $-61, %al
 ; AVX512BW-ONLY-FAST-NEXT:    kmovd %eax, %k1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm9 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT:    # zmm4 = zmm4[0,1,2,3],mem[4,5,6,7]
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, %zmm16 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, %zmm4 {%k1}
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
 ; AVX512BW-ONLY-FAST-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm5 # 64-byte Folded Reload
 ; AVX512BW-ONLY-FAST-NEXT:    # zmm5 = zmm5[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, %zmm5 {%k1}
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm7 # 64-byte Folded Reload
-; AVX512BW-ONLY-FAST-NEXT:    # zmm7 = zmm7[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, %zmm7 {%k1}
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm9 # 64-byte Folded Reload
-; AVX512BW-ONLY-FAST-NEXT:    # zmm9 = zmm9[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, %zmm5 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm8 # 64-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT:    # zmm8 = zmm8[0,1,2,3],mem[4,5,6,7]
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm10, %zmm9 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm10, %zmm8 {%k1}
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
 ; AVX512BW-ONLY-FAST-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm10 # 64-byte Folded Reload
 ; AVX512BW-ONLY-FAST-NEXT:    # zmm10 = zmm10[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm10 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm11, %zmm10 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm11 # 64-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT:    # zmm11 = zmm11[0,1,2,3],mem[4,5,6,7]
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm13 # 64-byte Folded Reload
-; AVX512BW-ONLY-FAST-NEXT:    # zmm13 = zmm12[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm31, %zmm13 {%k1}
-; AVX512BW-ONLY-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm17 = zmm29[0,1,2,3],zmm30[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm12, %zmm11 {%k1}
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm12, %zmm17 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm12 # 64-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT:    # zmm12 = zmm12[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm18, %zmm12 {%k1}
 ; AVX512BW-ONLY-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm17, 3008(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm20, 2944(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm27, 2880(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm12, 2816(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm23, 2752(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm24, 2688(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm12, 2624(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, 2560(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm12, 3008(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm26, 2944(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, 2880(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm7, 2816(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm25, 2752(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm23, 2688(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm7, 2624(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm11, 2560(%rax)
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, 2496(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, 2432(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm12, 2368(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm28, 2304(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm26, 2240(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm19, 2432(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm7, 2368(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm17, 2304(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm27, 2240(%rax)
 ; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 2176(%rax)
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm10, 2112(%rax)
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, 2048(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, 1984(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm10, 1920(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm25, 1856(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 1792(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm28, 1984(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm7, 1920(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm21, 1856(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm29, 1792(%rax)
 ; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 1728(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, 1664(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, 1664(%rax)
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, 1600(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm15, 1536(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm9, 1472(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 1408(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm6, 1536(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm7, 1472(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm22, 1408(%rax)
 ; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 1344(%rax)
 ; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 1280(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, 1216(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, 1152(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm19, 1088(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm7, 1024(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, 1216(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, 1152(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, 1088(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm5, 1024(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups (%rsp), %zmm0 # 64-byte Reload
 ; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 960(%rax)
 ; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 896(%rax)
 ; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 832(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, 768(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm14, 704(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm21, 640(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm5, 576(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, 768(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm30, 704(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm14, 640(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm4, 576(%rax)
 ; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 512(%rax)
 ; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 448(%rax)
 ; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 384(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm16, 320(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm18, 256(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm22, 192(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 128(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, 320(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm16, 256(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm20, 192(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm3, 128(%rax)
 ; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 64(%rax)
 ; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, (%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm11, 3520(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm15, 3520(%rax)
 ; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 3456(%rax)
 ; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 3392(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 3328(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm31, 3328(%rax)
 ; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 3264(%rax)
 ; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 3200(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm6, 3072(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm24, 3072(%rax)
 ; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 3136(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    addq $6696, %rsp # imm = 0x1A28
+; AVX512BW-ONLY-FAST-NEXT:    addq $6120, %rsp # imm = 0x17E8
 ; AVX512BW-ONLY-FAST-NEXT:    vzeroupper
 ; AVX512BW-ONLY-FAST-NEXT:    retq
 ;
 ; AVX512DQBW-SLOW-LABEL: store_i64_stride7_vf64:
 ; AVX512DQBW-SLOW:       # %bb.0:
-; AVX512DQBW-SLOW-NEXT:    subq $6472, %rsp # imm = 0x1948
+; AVX512DQBW-SLOW-NEXT:    subq $6280, %rsp # imm = 0x1888
 ; AVX512DQBW-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm3
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 64(%rdi), %zmm21
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%rsi), %zmm4
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 64(%rsi), %zmm18
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm18, (%rsp) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 64(%rdx), %zmm13
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm7
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 64(%rdi), %zmm16
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%rsi), %zmm9
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 64(%rsi), %zmm13
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 64(%rdx), %zmm18
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm18, (%rsp) # 64-byte Spill
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%rdx), %zmm5
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%rcx), %zmm12
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 64(%rcx), %zmm20
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm17 = [11,3,11,3,11,3,11,3]
-; AVX512DQBW-SLOW-NEXT:    # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQBW-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm27 = [2,10,0,3,2,10,0,3]
-; AVX512DQBW-SLOW-NEXT:    # zmm27 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%rcx), %zmm6
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 64(%rcx), %zmm15
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm10 = [11,3,11,3,11,3,11,3]
+; AVX512DQBW-SLOW-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm31 = [2,10,0,3,2,10,0,3]
+; AVX512DQBW-SLOW-NEXT:    # zmm31 = mem[0,1,2,3,0,1,2,3]
 ; AVX512DQBW-SLOW-NEXT:    movb $96, %r10b
 ; AVX512DQBW-SLOW-NEXT:    kmovd %r10d, %k1
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%r8), %zmm1
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%r9), %zmm9
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%rax), %zmm6
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 64(%rax), %zmm7
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%r9), %zmm14
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%rax), %zmm3
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 64(%rax), %zmm4
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm25 = [15,7,15,7,15,7,15,7]
+; AVX512DQBW-SLOW-NEXT:    # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [9,1,9,1,9,1,9,1]
 ; AVX512DQBW-SLOW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm2
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm9, %zmm0, %zmm2
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm11
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm14, %zmm0, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm8
 ; AVX512DQBW-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [4,9,0,3,4,9,0,3]
 ; AVX512DQBW-SLOW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm6, %zmm0, %zmm2
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm10
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm3, %zmm0, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm11
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm4, %zmm17, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm9, %zmm10, %zmm0
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm2
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm12, %zmm27, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm6, %zmm31, %zmm2
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vmovdqa (%r9), %ymm5
+; AVX512DQBW-SLOW-NEXT:    vmovdqa (%r9), %ymm7
+; AVX512DQBW-SLOW-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 64(%r9), %ymm5
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa 64(%r9), %ymm3
-; AVX512DQBW-SLOW-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa (%r8), %ymm0
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa 64(%r8), %ymm4
-; AVX512DQBW-SLOW-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[2],ymm5[2]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 64(%r8), %ymm6
+; AVX512DQBW-SLOW-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[2],ymm7[2]
 ; AVX512DQBW-SLOW-NEXT:    movb $28, %r10b
 ; AVX512DQBW-SLOW-NEXT:    kmovd %r10d, %k2
-; AVX512DQBW-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm0[2,3,2,3],zmm6[2,3,2,3]
+; AVX512DQBW-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm0[2,3,2,3],zmm3[2,3,2,3]
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [4,12,0,5,4,12,0,5]
-; AVX512DQBW-SLOW-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm28 = [4,12,0,5,4,12,0,5]
+; AVX512DQBW-SLOW-NEXT:    # zmm28 = mem[0,1,2,3,0,1,2,3]
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm5
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm9, %zmm2, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm16
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm14, %zmm28, %zmm0
 ; AVX512DQBW-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [0,1,12,7,0,1,12,7]
 ; AVX512DQBW-SLOW-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm6, %zmm2, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm9
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm3, %zmm2, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm12
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm19 = [5,0,14,6,5,0,14,6]
-; AVX512DQBW-SLOW-NEXT:    # zmm19 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm2
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm14
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm1, %zmm19, %zmm2
-; AVX512DQBW-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm15 = [0,13,6,7,0,13,6,7]
-; AVX512DQBW-SLOW-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm6, %zmm15, %zmm2
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm5 = [15,7,15,7,15,7,15,7]
-; AVX512DQBW-SLOW-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm14, %zmm5, %zmm1
-; AVX512DQBW-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [6,13,14,7,6,13,14,7]
-; AVX512DQBW-SLOW-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm1, %zmm8, %zmm6
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm1
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm18, %zmm17, %zmm1
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm2
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm20, %zmm27, %zmm2
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm4[0],ymm3[0],ymm4[2],ymm3[2]
-; AVX512DQBW-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm1[2,3,2,3],zmm7[2,3,2,3]
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 64(%r8), %zmm1
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 64(%r9), %zmm3
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm2
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm3, %zmm11, %zmm2
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm7, %zmm10, %zmm2
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm2
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm3, %zmm16, %zmm2
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm7, %zmm9, %zmm2
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm17 = [5,0,14,6,5,0,14,6]
+; AVX512DQBW-SLOW-NEXT:    # zmm17 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm14, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm1, %zmm17, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm24 = [0,13,6,7,0,13,6,7]
+; AVX512DQBW-SLOW-NEXT:    # zmm24 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm3, %zmm24, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm14, %zmm25, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm27 = [6,13,14,7,6,13,14,7]
+; AVX512DQBW-SLOW-NEXT:    # zmm27 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm1, %zmm27, %zmm3
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm1, %zmm19, %zmm2
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm7, %zmm15, %zmm2
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm3, %zmm5, %zmm1
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm1, %zmm8, %zmm7
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 128(%rdi), %zmm22
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 128(%rsi), %zmm23
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm22, %zmm1
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm23, %zmm17, %zmm1
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm29
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 128(%rdx), %zmm18
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 128(%rcx), %zmm14
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm13, %zmm10, %zmm1
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm2
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm14, %zmm27, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm15, %zmm31, %zmm2
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 128(%rax), %zmm7
-; AVX512DQBW-SLOW-NEXT:    vmovdqa 128(%r9), %ymm6
-; AVX512DQBW-SLOW-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa 128(%r8), %ymm1
-; AVX512DQBW-SLOW-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm6[0],ymm1[2],ymm6[2]
-; AVX512DQBW-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm1[2,3,2,3],zmm7[2,3,2,3]
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 128(%r8), %zmm1
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 128(%r9), %zmm6
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm2
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm6, %zmm11, %zmm2
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm31
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm7, %zmm10, %zmm2
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm30
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm2
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm6, %zmm16, %zmm2
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm7, %zmm9, %zmm2
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm2
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm1, %zmm19, %zmm2
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm7, %zmm15, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm6[0],ymm5[0],ymm6[2],ymm5[2]
+; AVX512DQBW-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm1[2,3,2,3],zmm4[2,3,2,3]
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 64(%r8), %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 64(%r9), %zmm20
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm6
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm20, %zmm8, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm4, %zmm11, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm20, %zmm28, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm4, %zmm12, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm1, %zmm17, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm4, %zmm24, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm30
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm20, %zmm25, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm1, %zmm27, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 128(%rdi), %zmm1
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm6, %zmm5, %zmm1
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm1, %zmm8, %zmm7
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 192(%rdi), %zmm28
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 192(%rsi), %zmm21
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm1
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm21, %zmm17, %zmm1
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 192(%rdx), %zmm13
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 192(%rcx), %zmm26
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm2
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm26, %zmm27, %zmm2
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 192(%rax), %zmm6
-; AVX512DQBW-SLOW-NEXT:    vmovdqa 192(%r9), %ymm4
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 128(%rsi), %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm0, %zmm10, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 128(%rdx), %zmm23
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 128(%rcx), %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm0, %zmm31, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm3 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 128(%rax), %zmm5
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 128(%r9), %ymm4
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa 192(%r8), %ymm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 128(%r8), %ymm1
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[2],ymm4[2]
-; AVX512DQBW-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm1[2,3,2,3],zmm6[2,3,2,3]
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 192(%r8), %zmm1
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 192(%r9), %zmm3
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm2
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm3, %zmm11, %zmm2
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm6, %zmm10, %zmm2
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm2
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm3, %zmm16, %zmm2
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm6, %zmm9, %zmm2
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm1[2,3,2,3],zmm5[2,3,2,3]
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm1, %zmm19, %zmm2
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm6, %zmm15, %zmm2
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm3, %zmm5, %zmm1
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm1, %zmm8, %zmm6
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 256(%rdi), %zmm25
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 256(%rsi), %zmm17
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm25, %zmm2
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm29, %zmm9
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm17, %zmm29, %zmm2
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 256(%rdx), %zmm11
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 256(%rcx), %zmm29
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm4
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm29, %zmm27, %zmm4
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm4 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 256(%rax), %zmm1
-; AVX512DQBW-SLOW-NEXT:    vmovdqa 256(%r9), %ymm7
-; AVX512DQBW-SLOW-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa 256(%r8), %ymm2
-; AVX512DQBW-SLOW-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm2 = ymm2[0],ymm7[0],ymm2[2],ymm7[2]
-; AVX512DQBW-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm2[2,3,2,3],zmm1[2,3,2,3]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 128(%r8), %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 128(%r9), %zmm13
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm13, %zmm8, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm5, %zmm11, %zmm4
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 256(%r8), %zmm2
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 256(%r9), %zmm6
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm4
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm6, %zmm31, %zmm4
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm1, %zmm10, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm13, %zmm28, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm18
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm5, %zmm12, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm8
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm4
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm6, %zmm16, %zmm4
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm1, %zmm0, %zmm4
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm1, %zmm17, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm8, %zmm24, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm22
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm13, %zmm25, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm1, %zmm27, %zmm8
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 192(%rdi), %zmm26
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 192(%rsi), %zmm0
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm4
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm2, %zmm19, %zmm4
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm1, %zmm15, %zmm4
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm26, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm0, %zmm10, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm9
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 192(%rdx), %zmm21
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 192(%rcx), %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm0, %zmm31, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm5 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 192(%rax), %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 192(%r9), %ymm8
+; AVX512DQBW-SLOW-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 192(%r8), %ymm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm8[0],ymm1[2],ymm8[2]
+; AVX512DQBW-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm1[2,3,2,3],zmm2[2,3,2,3]
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 192(%r8), %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 192(%r9), %zmm8
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm16
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm8, %zmm6, %zmm16
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm2, %zmm11, %zmm16
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm15
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm8, %zmm28, %zmm15
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm2, %zmm12, %zmm15
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm6, %zmm5, %zmm2
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm2, %zmm8, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm12
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm1, %zmm17, %zmm12
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm2, %zmm24, %zmm12
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm10
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm8, %zmm25, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm1, %zmm27, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 256(%rdi), %zmm15
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 256(%rsi), %zmm19
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm15, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm19, %zmm9, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm29
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 256(%rdx), %zmm14
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 256(%rcx), %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm14, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm0, %zmm31, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm1 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 256(%rax), %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 256(%r9), %ymm12
+; AVX512DQBW-SLOW-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 256(%r8), %ymm4
+; AVX512DQBW-SLOW-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm4 = ymm4[0],ymm12[0],ymm4[2],ymm12[2]
+; AVX512DQBW-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k2} = zmm4[2,3,2,3],zmm0[2,3,2,3]
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 320(%rdi), %zmm7
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 320(%rsi), %zmm10
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm4
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm10, %zmm9, %zmm4
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 320(%rdx), %zmm6
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 320(%rcx), %zmm24
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm9
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm24, %zmm27, %zmm9
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm9 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 320(%rax), %zmm2
-; AVX512DQBW-SLOW-NEXT:    vmovdqa 320(%r9), %ymm1
-; AVX512DQBW-SLOW-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 256(%r8), %zmm4
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 256(%r9), %zmm12
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm12, %zmm6, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm0, %zmm11, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm12, %zmm28, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm0, %zmm18, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm4, %zmm17, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm0, %zmm24, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm9
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm12, %zmm25, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm4, %zmm27, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 320(%rdi), %zmm16
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 320(%rsi), %zmm3
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm7
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm3, %zmm29, %zmm7
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 320(%rdx), %zmm4
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 320(%rcx), %zmm29
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm29, %zmm31, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm5 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 320(%rax), %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 320(%r9), %ymm7
+; AVX512DQBW-SLOW-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa 320(%r8), %ymm0
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
-; AVX512DQBW-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm9 {%k2} = zmm4[2,3,2,3],zmm2[2,3,2,3]
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 320(%r8), %zmm4
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 320(%r9), %zmm1
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm1, %zmm31, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm2, %zmm30, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm1, %zmm16, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm2, %zmm3, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm7 = ymm0[0],ymm7[0],ymm0[2],ymm7[2]
+; AVX512DQBW-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm7[2,3,2,3],zmm1[2,3,2,3]
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 320(%r8), %zmm7
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 320(%r9), %zmm11
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm11, %zmm6, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm4, %zmm19, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm2, %zmm15, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm11, %zmm28, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm1, %zmm18, %zmm0
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm1, %zmm5, %zmm4
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm4, %zmm8, %zmm2
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm9 = [13,5,13,5,13,5,13,5]
-; AVX512DQBW-SLOW-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm12, %zmm9, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm7, %zmm17, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm1, %zmm24, %zmm0
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [6,14,6,14,6,14,6,14]
-; AVX512DQBW-SLOW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm1
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm12, %zmm0, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm11, %zmm25, %zmm7
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm7, %zmm27, %zmm1
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm4
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm12, %zmm5, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 384(%rdx), %zmm5
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 384(%rcx), %zmm27
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm27, %zmm25, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 384(%rdi), %zmm7
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 384(%rsi), %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm1, %zmm25, %zmm7
+; AVX512DQBW-SLOW-NEXT:    movb $24, %r10b
+; AVX512DQBW-SLOW-NEXT:    kmovd %r10d, %k3
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm7 {%k3}
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [0,8,0,8,0,8,0,8]
+; AVX512DQBW-SLOW-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [3,0,12,4,3,0,12,4]
-; AVX512DQBW-SLOW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm2, %zmm0, %zmm12
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm8
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm2
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm12, %zmm9, %zmm2
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm2
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm12, %zmm4, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm20, %zmm1, %zmm30
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm13, %zmm1, %zmm22
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm8, %zmm1, %zmm10
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm12, %zmm1, %zmm9
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm11, %zmm1, %zmm2
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm2
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm12, %zmm5, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 384(%r8), %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm2 = <u,1,2,3,4,15,u,u>
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm0, %zmm2, %zmm7
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 384(%r9), %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm8 = <u,1,2,3,4,5,15,u>
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm2, %zmm8, %zmm7
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm2, %zmm0, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm2, %zmm0, %zmm6
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm2, %zmm0, %zmm28
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm0, %zmm17, %zmm2
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm1, %zmm0, %zmm12
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm15
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm1
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm14, %zmm9, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm6 = [13,5,13,5,13,5,13,5]
+; AVX512DQBW-SLOW-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm17, %zmm6, %zmm1
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm1
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm14, %zmm4, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm9 = [6,14,6,14,6,14,6,14]
+; AVX512DQBW-SLOW-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm17, %zmm9, %zmm1
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm1
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm14, %zmm5, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm17, %zmm25, %zmm1
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm18, %zmm0, %zmm14
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm14, %zmm18
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm1
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm26, %zmm9, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [3,0,12,4,3,0,12,4]
+; AVX512DQBW-SLOW-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm0, %zmm2, %zmm17
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm20, %zmm6, %zmm1
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm1
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm26, %zmm4, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm20, %zmm9, %zmm1
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm1
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm26, %zmm5, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm20, %zmm25, %zmm1
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm13, %zmm0, %zmm26
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm1
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm29, %zmm9, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm0, %zmm2, %zmm20
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm30, %zmm6, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm30, %zmm9, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm30, %zmm25, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm23, %zmm2, %zmm30
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm22, %zmm6, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm22, %zmm9, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm22, %zmm25, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm21, %zmm2, %zmm22
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm14, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm23, %zmm6, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm14, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm23, %zmm9, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm14, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm23, %zmm25, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm14, %zmm2, %zmm23
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm29, %zmm6, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm29, %zmm9, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm29, %zmm25, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm4, %zmm2, %zmm29
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm29, %zmm18
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 448(%rdx), %zmm10
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 448(%rcx), %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm7
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm0, %zmm31, %zmm7
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm27, %zmm5, %zmm31
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm31, (%rsp) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm27, %zmm6, %zmm1
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm1
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm29, %zmm4, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm27, %zmm9, %zmm1
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm1
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm29, %zmm5, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm5, %zmm2, %zmm27
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm27, %zmm28
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm8
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm10, %zmm2, %zmm8
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm12
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm0, %zmm9, %zmm12
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm11
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm0, %zmm6, %zmm11
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm0, %zmm25, %zmm10
+; AVX512DQBW-SLOW-NEXT:    movb $48, %r10b
+; AVX512DQBW-SLOW-NEXT:    kmovd %r10d, %k4
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm5 = [1,0,10,2,1,0,10,2]
+; AVX512DQBW-SLOW-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm0, %zmm5, %zmm1
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm11, %zmm0, %zmm29
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm1
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm24, %zmm9, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm17 {%k4} = zmm0[0],zmm2[0],zmm0[2],zmm2[2],zmm0[4],zmm2[4],zmm0[6],zmm2[6]
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm13
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm14
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm4 = [0,8,0,1,0,8,0,1]
+; AVX512DQBW-SLOW-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm2, %zmm4, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm2, %zmm6, %zmm1
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm1
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm24, %zmm4, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm2, %zmm9, %zmm13
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm2, %zmm25, %zmm14
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm0, %zmm5, %zmm1
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm1
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm24, %zmm5, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm20 {%k4} = zmm0[0],zmm2[0],zmm0[2],zmm2[2],zmm0[4],zmm2[4],zmm0[6],zmm2[6]
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm13
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm17
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm2, %zmm4, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm2, %zmm6, %zmm13
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm2, %zmm9, %zmm1
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm6, %zmm0, %zmm24
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm24, %zmm6
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 448(%rdx), %zmm14
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 448(%rcx), %zmm1
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm14, %zmm12
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm1, %zmm27, %zmm12
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm14, %zmm19
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm1, %zmm4, %zmm19
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 384(%rdx), %zmm3
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 384(%rcx), %zmm20
-; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm20, %zmm3, %zmm27
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm2
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm20, %zmm9, %zmm2
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm20, %zmm3, %zmm4
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm27
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm20, %zmm5, %zmm27
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm3, %zmm0, %zmm20
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm3
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm14, %zmm0, %zmm3
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm14, %zmm16
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm1, %zmm5, %zmm14
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm1, %zmm9, %zmm16
-; AVX512DQBW-SLOW-NEXT:    movb $48, %r10b
-; AVX512DQBW-SLOW-NEXT:    kmovd %r10d, %k3
-; AVX512DQBW-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [1,0,10,2,1,0,10,2]
-; AVX512DQBW-SLOW-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm2, %zmm25, %zmm17
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm0, %zmm1, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm1, %zmm5, %zmm2
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm8 {%k3} = zmm0[0],zmm4[0],zmm0[2],zmm4[2],zmm0[4],zmm4[4],zmm0[6],zmm4[6]
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm5
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm8
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm11
-; AVX512DQBW-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [0,8,0,1,0,8,0,1]
-; AVX512DQBW-SLOW-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm4, %zmm2, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm4, %zmm9, %zmm5
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm13 = [6,14,6,14]
-; AVX512DQBW-SLOW-NEXT:    # ymm13 = mem[0,1,0,1]
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm4, %zmm13, %zmm8
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm8 = [15,7,15,7]
-; AVX512DQBW-SLOW-NEXT:    # ymm8 = mem[0,1,0,1]
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm4, %zmm8, %zmm11
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 (%rsp), %zmm4 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm30 {%k4} = zmm1[0],zmm0[0],zmm1[2],zmm0[2],zmm1[4],zmm0[4],zmm1[6],zmm0[6]
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm13
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm29
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm0, %zmm4, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm0, %zmm6, %zmm13
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm0, %zmm9, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm0, %zmm25, %zmm29
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm0, %zmm1, %zmm5
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm15 {%k3} = zmm0[0],zmm4[0],zmm0[2],zmm4[2],zmm0[4],zmm4[4],zmm0[6],zmm4[6]
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm15
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm5
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm24
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm4, %zmm2, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm4, %zmm9, %zmm15
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm4, %zmm13, %zmm5
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm4, %zmm8, %zmm24
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm22, %zmm1, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm18 {%k3} = zmm22[0],zmm23[0],zmm22[2],zmm23[2],zmm22[4],zmm23[4],zmm22[6],zmm23[6]
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm22, %zmm4
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm22, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm22, %zmm5
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm23, %zmm2, %zmm22
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm26, %zmm5, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm22 {%k4} = zmm26[0],zmm0[0],zmm26[2],zmm0[2],zmm26[4],zmm0[4],zmm26[6],zmm0[6]
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm23, %zmm9, %zmm4
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm23, %zmm13, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm23, %zmm8, %zmm5
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm28, %zmm1, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm26 {%k3} = zmm28[0],zmm21[0],zmm28[2],zmm21[2],zmm28[4],zmm21[4],zmm28[6],zmm21[6]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm26, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm26, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm26, %zmm31
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm0, %zmm4, %zmm26
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm4
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm23
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm21, %zmm2, %zmm28
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm21, %zmm9, %zmm4
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm21, %zmm13, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm21, %zmm8, %zmm23
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm25, %zmm1, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm29 {%k3} = zmm25[0],zmm17[0],zmm25[2],zmm17[2],zmm25[4],zmm17[4],zmm25[6],zmm17[6]
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm25, %zmm31
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm25, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm25, %zmm29
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm25, %zmm30
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm17, %zmm2, %zmm30
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm17, %zmm9, %zmm31
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm17, %zmm13, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm17, %zmm8, %zmm29
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm26
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm7, %zmm1, %zmm26
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm6 {%k3} = zmm7[0],zmm10[0],zmm7[2],zmm10[2],zmm7[4],zmm10[4],zmm7[6],zmm10[6]
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm28
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm24
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm25
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm10, %zmm2, %zmm25
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm10, %zmm9, %zmm28
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm10, %zmm13, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm0, %zmm6, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm0, %zmm9, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm0, %zmm25, %zmm31
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm19, %zmm30
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm15, %zmm5, %zmm30
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm23 {%k4} = zmm15[0],zmm19[0],zmm15[2],zmm19[2],zmm15[4],zmm19[4],zmm15[6],zmm19[6]
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm15, %zmm22
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm15, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm15, %zmm26
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm15, %zmm27
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm19, %zmm4, %zmm27
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm19, %zmm6, %zmm22
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm19, %zmm9, %zmm0
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm10, %zmm8, %zmm24
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 448(%rdi), %zmm6
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm19, %zmm25, %zmm26
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 448(%rdi), %zmm15
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 448(%rsi), %zmm0
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm7
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm0, %zmm10, %zmm7
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm4
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm0, %zmm13, %zmm4
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm5
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm0, %zmm9, %zmm5
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 384(%rdi), %zmm21
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 384(%rsi), %zmm17
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm22
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm21, %zmm1, %zmm22
-; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm17, %zmm21, %zmm10
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm17, %zmm21, %zmm9
-; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm17, %zmm21, %zmm13
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm20 {%k3} = zmm21[0],zmm17[0],zmm21[2],zmm17[2],zmm21[4],zmm17[4],zmm21[6],zmm17[6]
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm18
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm17, %zmm2, %zmm21
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm17, %zmm8, %zmm18
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 {%k3} = zmm6[0],zmm0[0],zmm6[2],zmm0[2],zmm6[4],zmm0[4],zmm6[6],zmm0[6]
-; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm0, %zmm6, %zmm2
-; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm6, %zmm0, %zmm1
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm0, %zmm8, %zmm6
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm12 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm4[0,1,2,3],zmm19[4,5,6,7]
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 448(%r8), %zmm4
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm7 = <0,11,u,u,4,5,6,7>
-; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm4, %zmm3, %zmm7
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 448(%r9), %zmm8
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = <0,1,11,u,4,5,6,7>
-; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm8, %zmm7, %zmm3
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm15
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm15, %zmm13
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm0, %zmm25, %zmm13
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm23
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm3, %zmm25, %zmm23
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm24
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm16, %zmm5, %zmm24
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm18 {%k4} = zmm16[0],zmm3[0],zmm16[2],zmm3[2],zmm16[4],zmm3[4],zmm16[6],zmm3[6]
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm17
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm21
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm3, %zmm4, %zmm21
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm3, %zmm6, %zmm17
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm3, %zmm9, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm20
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm16, %zmm5, %zmm20
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm15, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm0, %zmm25, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm18, %zmm16, %zmm25
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm15, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm0, %zmm6, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm18, %zmm16, %zmm6
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm15, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm0, %zmm9, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm18, %zmm16, %zmm9
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm28 {%k4} = zmm16[0],zmm18[0],zmm16[2],zmm18[2],zmm16[4],zmm18[4],zmm16[6],zmm18[6]
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm28
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm18, %zmm4, %zmm28
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm8 {%k4} = zmm15[0],zmm0[0],zmm15[2],zmm0[2],zmm15[4],zmm0[4],zmm15[6],zmm0[6]
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm0, %zmm15, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm15, %zmm5, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm12[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm7 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 448(%r8), %zmm3
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm5 = <0,11,u,u,4,5,6,7>
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm3, %zmm8, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm1 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm8 = <12,u,u,3,4,5,6,13>
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm3, %zmm1, %zmm8
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm13 {%k3}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = <u,1,2,3,4,15,u,u>
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm3, %zmm1, %zmm13
+; AVX512DQBW-SLOW-NEXT:    movb $6, %sil
+; AVX512DQBW-SLOW-NEXT:    kmovd %esi, %k4
+; AVX512DQBW-SLOW-NEXT:    vpbroadcastq 456(%rcx), %ymm1
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm0 {%k4}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm10 = <0,1,2,9,u,u,6,7>
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm3, %zmm0, %zmm10
+; AVX512DQBW-SLOW-NEXT:    movb $64, %sil
+; AVX512DQBW-SLOW-NEXT:    kmovd %esi, %k5
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm2 {%k5}
 ; AVX512DQBW-SLOW-NEXT:    movb $4, %sil
-; AVX512DQBW-SLOW-NEXT:    kmovd %esi, %k3
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm12 {%k3}
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm7 = <0,1,2,10,u,5,6,7>
-; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm8, %zmm12, %zmm7
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm5 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = <12,u,u,3,4,5,6,13>
-; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm4, %zmm5, %zmm3
+; AVX512DQBW-SLOW-NEXT:    kmovd %esi, %k5
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm7 {%k5}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 448(%r9), %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = <0,1,11,u,4,5,6,7>
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm0, %zmm5, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm11
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = <0,1,2,10,u,5,6,7>
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm0, %zmm7, %zmm3
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm5 = <0,12,u,3,4,5,6,7>
-; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm8, %zmm3, %zmm5
-; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,8,0,8,0,8,0,8]
-; AVX512DQBW-SLOW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm19 # 64-byte Folded Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm12 # 64-byte Folded Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm0, %zmm8, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm7 = <13,u,2,3,4,5,6,14>
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm0, %zmm2, %zmm7
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = <u,1,2,3,4,5,15,u>
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm0, %zmm1, %zmm13
+; AVX512DQBW-SLOW-NEXT:    movb $12, %sil
+; AVX512DQBW-SLOW-NEXT:    kmovd %esi, %k5
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 448(%rdx), %xmm2
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0]
+; AVX512DQBW-SLOW-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm0, %zmm4 {%k5}
+; AVX512DQBW-SLOW-NEXT:    vinserti32x4 $2, 448(%r8), %zmm4, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm4 = <0,1,2,3,4,8,u,7>
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm0, %zmm2, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm2 = <0,1,2,3,9,u,6,7>
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm0, %zmm10, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 384(%rax), %zmm0
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm10 # 64-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm0, %zmm8, %zmm10
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 (%rsp), %zmm1 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm25, %zmm1 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 384(%r9), %ymm12
+; AVX512DQBW-SLOW-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 384(%r8), %ymm8
+; AVX512DQBW-SLOW-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm8 = ymm8[0],ymm12[0],ymm8[2],ymm12[2]
+; AVX512DQBW-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k2} = zmm8[2,3,2,3],zmm0[2,3,2,3]
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, (%rsp) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm12 # 64-byte Folded Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm0, %zmm12, %zmm8
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm12 # 64-byte Folded Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm0, %zmm12, %zmm8
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm8 = [14,1,2,3,4,5,6,15]
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm12 # 64-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm0, %zmm8, %zmm12
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    movb $24, %sil
-; AVX512DQBW-SLOW-NEXT:    kmovd %esi, %k5
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm27, %zmm18 {%k5}
-; AVX512DQBW-SLOW-NEXT:    movb $6, %sil
-; AVX512DQBW-SLOW-NEXT:    kmovd %esi, %k3
-; AVX512DQBW-SLOW-NEXT:    vpbroadcastq 456(%rcx), %ymm12
-; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm12 = mem[0,1,2,3],ymm12[4,5,6,7]
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm12, %zmm0, %zmm1 {%k3}
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm12 = <0,1,2,9,u,u,6,7>
-; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm4, %zmm1, %zmm12
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm14, %zmm6 {%k5}
-; AVX512DQBW-SLOW-NEXT:    movb $64, %sil
-; AVX512DQBW-SLOW-NEXT:    kmovd %esi, %k4
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm0 {%k4}
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = <u,1,2,3,4,15,u,u>
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm4, %zmm1, %zmm6
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 384(%r8), %zmm4
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm4, %zmm1, %zmm18
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 384(%r9), %zmm1
-; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm1, %zmm4, %zmm3
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm1, %zmm4, %zmm14
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm1, %zmm4, %zmm16
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm4, %zmm1, %zmm17
-; AVX512DQBW-SLOW-NEXT:    movb $12, %sil
-; AVX512DQBW-SLOW-NEXT:    kmovd %esi, %k4
-; AVX512DQBW-SLOW-NEXT:    vmovdqa 448(%rdx), %xmm4
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],mem[0]
-; AVX512DQBW-SLOW-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm0, %zmm2 {%k4}
-; AVX512DQBW-SLOW-NEXT:    vinserti32x4 $2, 448(%r8), %zmm2, %zmm2
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm4 = <0,1,2,3,4,8,u,7>
-; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm8, %zmm2, %zmm4
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm2 = <u,1,2,3,4,5,15,u>
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm1, %zmm2, %zmm18
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = <0,1,2,3,9,u,6,7>
-; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm8, %zmm12, %zmm1
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm12 = <13,u,2,3,4,5,6,14>
-; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm8, %zmm0, %zmm12
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm8, %zmm2, %zmm6
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 448(%rax), %zmm2
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,10,5,6,7]
-; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm2, %zmm7, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [0,1,12,3,4,5,6,7]
-; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm2, %zmm5, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 384(%rax), %zmm5
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm5, %zmm0, %zmm14
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm14, %zmm8
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm14 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vmovdqa 384(%r9), %ymm7
-; AVX512DQBW-SLOW-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa 384(%r8), %ymm0
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm7 = ymm0[0],ymm7[0],ymm0[2],ymm7[2]
-; AVX512DQBW-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm7[2,3,2,3],zmm5[2,3,2,3]
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm5, %zmm7, %zmm16
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm14
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm5, %zmm7, %zmm17
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,8,7]
-; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm2, %zmm4, %zmm7
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [14,1,2,3,4,5,6,15]
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm5, %zmm4, %zmm18
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,9,6,7]
-; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm2, %zmm1, %zmm5
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 448(%rax), %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,10,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm0, %zmm3, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,1,12,3,4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm0, %zmm5, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,13,2,3,4,5,6,7]
-; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm2, %zmm12, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm0, %zmm7, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm0, %zmm8, %zmm13
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,4,5,8,7]
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm0, %zmm4, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,4,9,6,7]
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm0, %zmm2, %zmm1
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm2, %zmm4, %zmm6
 ; AVX512DQBW-SLOW-NEXT:    movb $8, %sil
 ; AVX512DQBW-SLOW-NEXT:    kmovd %esi, %k2
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm15 {%k2}
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm11 {%k5}
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k5}
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k5}
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm23 {%k5}
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm29 {%k5}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm11 {%k2}
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm14 {%k3}
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm24 {%k5}
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k3}
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm29 {%k3}
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm31 {%k3}
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm26 {%k3}
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm23 {%k3}
 ; AVX512DQBW-SLOW-NEXT:    movb $-31, %sil
 ; AVX512DQBW-SLOW-NEXT:    kmovd %esi, %k2
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm14 {%k2}
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k2}
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm29 {%k2}
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm31 {%k2}
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm26 {%k2}
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm23 {%k2}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa (%rdx), %xmm0
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0]
+; AVX512DQBW-SLOW-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm11 {%k2}
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k2}
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k2}
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm23 {%k2}
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm29 {%k2}
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm24 {%k2}
-; AVX512DQBW-SLOW-NEXT:    vmovdqa (%rdx), %xmm1
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0]
-; AVX512DQBW-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm7 {%k4}
-; AVX512DQBW-SLOW-NEXT:    vmovdqa 64(%rdx), %xmm1
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0]
-; AVX512DQBW-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm1 {%k5}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 64(%rdx), %xmm0
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0]
+; AVX512DQBW-SLOW-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm2 {%k4}
-; AVX512DQBW-SLOW-NEXT:    vmovdqa 128(%rdx), %xmm1
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0]
-; AVX512DQBW-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k5}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 128(%rdx), %xmm0
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0]
+; AVX512DQBW-SLOW-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm3 {%k5}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 192(%rdx), %xmm0
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0]
+; AVX512DQBW-SLOW-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm4 {%k4}
-; AVX512DQBW-SLOW-NEXT:    vmovdqa 192(%rdx), %xmm1
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0]
-; AVX512DQBW-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm5 {%k4}
-; AVX512DQBW-SLOW-NEXT:    vmovdqa 256(%rdx), %xmm1
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0]
-; AVX512DQBW-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm30 {%k4}
-; AVX512DQBW-SLOW-NEXT:    vmovdqa 320(%rdx), %xmm1
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0]
-; AVX512DQBW-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm25 {%k4}
-; AVX512DQBW-SLOW-NEXT:    vmovdqa 384(%rdx), %xmm1
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0]
-; AVX512DQBW-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm21 {%k4}
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm4 {%k5}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 256(%rdx), %xmm0
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0]
+; AVX512DQBW-SLOW-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm27 {%k5}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 320(%rdx), %xmm0
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0]
+; AVX512DQBW-SLOW-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm21 {%k5}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 384(%rdx), %xmm0
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0]
+; AVX512DQBW-SLOW-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm28 {%k5}
 ; AVX512DQBW-SLOW-NEXT:    movb $112, %sil
 ; AVX512DQBW-SLOW-NEXT:    kmovd %esi, %k2
-; AVX512DQBW-SLOW-NEXT:    vinserti64x2 $3, (%rax), %zmm19, %zmm7 {%k2}
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vinserti64x2 $3, 64(%rax), %zmm1, %zmm2 {%k2}
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vinserti64x2 $3, (%rax), %zmm0, %zmm1 {%k2}
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vinserti64x2 $3, 64(%rax), %zmm0, %zmm2 {%k2}
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vinserti64x2 $3, 128(%rax), %zmm10, %zmm4 {%k2}
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vinserti64x2 $3, 128(%rax), %zmm0, %zmm3 {%k2}
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vinserti64x2 $3, 192(%rax), %zmm0, %zmm4 {%k2}
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vinserti64x2 $3, 256(%rax), %zmm0, %zmm27 {%k2}
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vinserti64x2 $3, 320(%rax), %zmm0, %zmm21 {%k2}
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vinserti64x2 $3, 384(%rax), %zmm0, %zmm28 {%k2}
+; AVX512DQBW-SLOW-NEXT:    vpbroadcastq 8(%rcx), %ymm0
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vinserti64x2 $3, 192(%rax), %zmm1, %zmm5 {%k2}
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vinserti64x2 $3, 256(%rax), %zmm1, %zmm30 {%k2}
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vinserti64x2 $3, 320(%rax), %zmm1, %zmm25 {%k2}
-; AVX512DQBW-SLOW-NEXT:    vinserti64x2 $3, 384(%rax), %zmm3, %zmm21 {%k2}
-; AVX512DQBW-SLOW-NEXT:    vpbroadcastq 8(%rcx), %ymm1
-; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm5 {%k3}
-; AVX512DQBW-SLOW-NEXT:    vpbroadcastq 72(%rcx), %ymm1
-; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm1 {%k4}
+; AVX512DQBW-SLOW-NEXT:    vpbroadcastq 72(%rcx), %ymm0
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm2 {%k3}
-; AVX512DQBW-SLOW-NEXT:    vpbroadcastq 136(%rcx), %ymm1
-; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k4}
+; AVX512DQBW-SLOW-NEXT:    vpbroadcastq 136(%rcx), %ymm0
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm3 {%k3}
-; AVX512DQBW-SLOW-NEXT:    vpbroadcastq 200(%rcx), %ymm1
-; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 (%rsp), %zmm4 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm4 {%k3}
-; AVX512DQBW-SLOW-NEXT:    vpbroadcastq 264(%rcx), %ymm1
-; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm7 {%k3}
-; AVX512DQBW-SLOW-NEXT:    vpbroadcastq 328(%rcx), %ymm1
-; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm26 {%k3}
-; AVX512DQBW-SLOW-NEXT:    vpbroadcastq 392(%rcx), %ymm1
-; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm22 {%k3}
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm3 {%k4}
+; AVX512DQBW-SLOW-NEXT:    vpbroadcastq 200(%rcx), %ymm0
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm4 {%k4}
+; AVX512DQBW-SLOW-NEXT:    vpbroadcastq 264(%rcx), %ymm0
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm30 {%k4}
+; AVX512DQBW-SLOW-NEXT:    vpbroadcastq 328(%rcx), %ymm0
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm24 {%k4}
+; AVX512DQBW-SLOW-NEXT:    vpbroadcastq 392(%rcx), %ymm0
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm20 {%k4}
 ; AVX512DQBW-SLOW-NEXT:    movb $56, %cl
 ; AVX512DQBW-SLOW-NEXT:    kmovd %ecx, %k2
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm5 {%k2}
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k2}
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k2}
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k2}
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm3 {%k2}
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm3 {%k2}
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm4 {%k2}
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm30 {%k2}
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm24 {%k2}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm20 {%k2}
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k2}
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm4, (%rsp) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm7 {%k2}
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm26 {%k2}
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm22 {%k2}
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm3 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm3 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm5 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm31 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm28 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm9 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm4 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm22 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm17 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm6 {%k1}
 ; AVX512DQBW-SLOW-NEXT:    movb $120, %cl
 ; AVX512DQBW-SLOW-NEXT:    kmovd %ecx, %k1
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm16 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm15 # 64-byte Folded Reload
-; AVX512DQBW-SLOW-NEXT:    # zmm15 = zmm1[0,1,2,3],mem[4,5,6,7]
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm19 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm27 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm19 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm16 # 64-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT:    # zmm16 = zmm0[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm14 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm15 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm25 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm22, %zmm29 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm22 {%k1}
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm12 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm10 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm31 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm14 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm14, %zmm23
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm12 {%k1}
 ; AVX512DQBW-SLOW-NEXT:    movb $-61, %cl
 ; AVX512DQBW-SLOW-NEXT:    kmovd %ecx, %k1
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm15 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm14 # 64-byte Folded Reload
-; AVX512DQBW-SLOW-NEXT:    # zmm14 = zmm1[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm16 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm11 # 64-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT:    # zmm11 = zmm0[0,1,2,3],mem[4,5,6,7]
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm14 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm11 # 64-byte Folded Reload
-; AVX512DQBW-SLOW-NEXT:    # zmm11 = zmm1[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm11 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm10 # 64-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT:    # zmm10 = zmm0[0,1,2,3],mem[4,5,6,7]
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm11 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm4 # 64-byte Folded Reload
-; AVX512DQBW-SLOW-NEXT:    # zmm4 = zmm1[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm10 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT:    # zmm4 = zmm0[0,1,2,3],mem[4,5,6,7]
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm4 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm5 # 64-byte Folded Reload
-; AVX512DQBW-SLOW-NEXT:    # zmm5 = zmm1[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT:    # zmm5 = zmm0[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm5 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm6 # 64-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT:    # zmm6 = zmm0[0,1,2,3],mem[4,5,6,7]
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm5 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm7 # 64-byte Folded Reload
-; AVX512DQBW-SLOW-NEXT:    # zmm7 = zmm1[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm6 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm7 # 64-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT:    # zmm7 = zmm9[0,1,2,3],mem[4,5,6,7]
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm7 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm8 # 64-byte Folded Reload
-; AVX512DQBW-SLOW-NEXT:    # zmm8 = zmm13[0,1,2,3],mem[4,5,6,7]
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm8 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload
-; AVX512DQBW-SLOW-NEXT:    # ymm9 = ymm9[1],mem[1],ymm9[3],mem[3]
-; AVX512DQBW-SLOW-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[0,2,3,3]
-; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT:    # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3]
+; AVX512DQBW-SLOW-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,2,3,3]
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7]
 ; AVX512DQBW-SLOW-NEXT:    movb $14, %cl
 ; AVX512DQBW-SLOW-NEXT:    kmovd %ecx, %k1
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm9, %zmm0, %zmm13 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload
-; AVX512DQBW-SLOW-NEXT:    # ymm9 = ymm9[1],mem[1],ymm9[3],mem[3]
-; AVX512DQBW-SLOW-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[0,2,3,3]
-; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm8, %zmm0, %zmm9 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT:    # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3]
+; AVX512DQBW-SLOW-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,2,3,3]
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7]
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm9, %zmm0, %zmm17 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload
-; AVX512DQBW-SLOW-NEXT:    # ymm9 = ymm9[1],mem[1],ymm9[3],mem[3]
-; AVX512DQBW-SLOW-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[0,2,3,3]
-; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7]
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm9, %zmm0, %zmm28 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload
-; AVX512DQBW-SLOW-NEXT:    # ymm9 = ymm9[1],mem[1],ymm9[3],mem[3]
-; AVX512DQBW-SLOW-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[0,2,3,3]
-; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm8, %zmm0, %zmm17 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT:    # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3]
+; AVX512DQBW-SLOW-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,2,3,3]
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm8, %zmm0, %zmm18 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT:    # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3]
+; AVX512DQBW-SLOW-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,2,3,3]
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7]
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm9, %zmm0, %zmm3 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload
-; AVX512DQBW-SLOW-NEXT:    # ymm9 = ymm9[1],mem[1],ymm9[3],mem[3]
-; AVX512DQBW-SLOW-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[0,2,3,3]
-; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm8, %zmm0, %zmm3 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT:    # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3]
+; AVX512DQBW-SLOW-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,2,3,3]
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7]
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm9, %zmm0, %zmm2 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload
-; AVX512DQBW-SLOW-NEXT:    # ymm9 = ymm9[1],mem[1],ymm9[3],mem[3]
-; AVX512DQBW-SLOW-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[0,2,3,3]
-; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm8, %zmm0, %zmm2 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT:    # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3]
+; AVX512DQBW-SLOW-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,2,3,3]
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7]
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm9, %zmm0, %zmm1 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX512DQBW-SLOW-NEXT:    # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
-; AVX512DQBW-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,3,3]
-; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm20 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm8, %zmm0, %zmm1 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT:    # ymm8 = ymm0[1],mem[1],ymm0[3],mem[3]
+; AVX512DQBW-SLOW-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,2,3,3]
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm8, %zmm0, %zmm0 {%k1}
 ; AVX512DQBW-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm8, 3008(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm23, 2944(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm20, 2880(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm7, 3008(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm12, 2944(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, 2880(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovups (%rsp), %zmm0 # 64-byte Reload
 ; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 2816(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm22, 2752(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm21, 2688(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm24, 2624(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm7, 2560(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm31, 2496(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm20, 2752(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm28, 2688(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm23, 2624(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm6, 2560(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm22, 2496(%rax)
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, 2432(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 2368(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm26, 2304(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm25, 2240(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm29, 2176(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm6, 2368(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm24, 2304(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm21, 2240(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm26, 2176(%rax)
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm5, 2112(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm10, 2048(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm29, 2048(%rax)
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm2, 1984(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 1920(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 1856(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm30, 1792(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 1728(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm5, 1920(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm30, 1856(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm27, 1792(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm31, 1728(%rax)
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm4, 1664(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm12, 1600(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm25, 1600(%rax)
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm3, 1536(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm4, 1472(%rax)
 ; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 1472(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovups (%rsp), %zmm0 # 64-byte Reload
 ; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 1408(%rax)
 ; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 1344(%rax)
 ; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 1280(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm11, 1216(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm27, 1152(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm28, 1088(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 1024(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm10, 1216(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm15, 1152(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm18, 1088(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm3, 1024(%rax)
 ; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 960(%rax)
 ; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 896(%rax)
 ; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 832(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm14, 768(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm19, 704(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm11, 768(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm14, 704(%rax)
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm17, 640(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 576(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm2, 576(%rax)
 ; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 512(%rax)
 ; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 448(%rax)
 ; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 384(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm15, 320(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm16, 256(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm13, 192(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm16, 320(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm19, 256(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm9, 192(%rax)
 ; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 128(%rax)
 ; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 64(%rax)
 ; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, (%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm6, 3520(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm13, 3520(%rax)
 ; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 3456(%rax)
 ; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
@@ -19327,815 +19242,809 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 3264(%rax)
 ; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 3200(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm18, 3072(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 3072(%rax)
 ; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 3136(%rax)
-; AVX512DQBW-SLOW-NEXT:    addq $6472, %rsp # imm = 0x1948
+; AVX512DQBW-SLOW-NEXT:    addq $6280, %rsp # imm = 0x1888
 ; AVX512DQBW-SLOW-NEXT:    vzeroupper
 ; AVX512DQBW-SLOW-NEXT:    retq
 ;
 ; AVX512DQBW-FAST-LABEL: store_i64_stride7_vf64:
 ; AVX512DQBW-FAST:       # %bb.0:
-; AVX512DQBW-FAST-NEXT:    subq $6568, %rsp # imm = 0x19A8
+; AVX512DQBW-FAST-NEXT:    subq $6120, %rsp # imm = 0x17E8
 ; AVX512DQBW-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%rdi), %zmm5
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%rdi), %zmm11
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%rdi), %zmm6
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%rdi), %zmm12
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%rsi), %zmm14
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%rsi), %zmm16
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%rdx), %zmm11
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%rsi), %zmm26
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%rsi), %zmm22
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%rdx), %zmm7
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%rdx), %zmm7
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%rdx), %zmm6
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%rcx), %zmm9
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%rcx), %zmm19
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm10 = [11,3,11,3,11,3,11,3]
-; AVX512DQBW-FAST-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQBW-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm12 = [2,10,0,3,2,10,0,3]
-; AVX512DQBW-FAST-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%rcx), %zmm13
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%rcx), %zmm21
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm8 = [11,3,11,3,11,3,11,3]
+; AVX512DQBW-FAST-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm4 = [2,10,0,3,2,10,0,3]
+; AVX512DQBW-FAST-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3]
 ; AVX512DQBW-FAST-NEXT:    movb $96, %r10b
 ; AVX512DQBW-FAST-NEXT:    kmovd %r10d, %k1
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%r8), %zmm1
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%r9), %zmm8
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%r9), %zmm22
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%rax), %zmm3
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%rax), %zmm4
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%rax), %zmm5
+; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm23 = [15,7,15,7,15,7,15,7]
+; AVX512DQBW-FAST-NEXT:    # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [9,1,9,1,9,1,9,1]
 ; AVX512DQBW-FAST-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, %zmm2
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm8, %zmm0, %zmm2
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm16
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm22, %zmm0, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm9
 ; AVX512DQBW-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [4,9,0,3,4,9,0,3]
 ; AVX512DQBW-FAST-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
 ; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm3, %zmm0, %zmm2
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm13
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm10
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm5, %zmm0
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm26, %zmm10, %zmm0
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm6, %zmm2
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm9, %zmm12, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm6, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm14, %zmm8, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm7, %zmm2
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm13, %zmm4, %zmm2
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k1}
-; AVX512DQBW-FAST-NEXT:    vmovdqa (%r9), %ymm0
+; AVX512DQBW-FAST-NEXT:    vmovdqa (%r9), %ymm7
+; AVX512DQBW-FAST-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa 64(%r9), %ymm13
+; AVX512DQBW-FAST-NEXT:    vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa (%r8), %ymm0
 ; AVX512DQBW-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa 64(%r9), %ymm9
-; AVX512DQBW-FAST-NEXT:    vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa (%r8), %ymm6
+; AVX512DQBW-FAST-NEXT:    vmovdqa 64(%r8), %ymm6
 ; AVX512DQBW-FAST-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%r8), %ymm30
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm6[0],ymm0[0],ymm6[2],ymm0[2]
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[2],ymm7[2]
 ; AVX512DQBW-FAST-NEXT:    movb $28, %r10b
 ; AVX512DQBW-FAST-NEXT:    kmovd %r10d, %k2
 ; AVX512DQBW-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm0[2,3,2,3],zmm3[2,3,2,3]
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [4,12,0,5,4,12,0,5]
-; AVX512DQBW-FAST-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, %zmm0
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm8, %zmm6
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm8, %zmm2, %zmm0
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm20
-; AVX512DQBW-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [0,1,12,7,0,1,12,7]
-; AVX512DQBW-FAST-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm3, %zmm2, %zmm0
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm17
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [5,0,14,6,5,0,14,6]
-; AVX512DQBW-FAST-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm8, %zmm2
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm1, %zmm0, %zmm2
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm21
-; AVX512DQBW-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [0,13,6,7,0,13,6,7]
+; AVX512DQBW-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [4,12,0,5,4,12,0,5]
 ; AVX512DQBW-FAST-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm3, %zmm0, %zmm2
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm18
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, %zmm2
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm22, %zmm0, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm7
+; AVX512DQBW-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm15 = [0,1,12,7,0,1,12,7]
+; AVX512DQBW-FAST-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm3, %zmm15, %zmm2
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm8 = [15,7,15,7,15,7,15,7]
-; AVX512DQBW-FAST-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm6, %zmm8, %zmm1
-; AVX512DQBW-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm14 = [6,13,14,7,6,13,14,7]
-; AVX512DQBW-FAST-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm1, %zmm14, %zmm3
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm11, %zmm1
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm22, %zmm10, %zmm1
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm7, %zmm2
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm12, %zmm7
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm19, %zmm12, %zmm2
+; AVX512DQBW-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm20 = [5,0,14,6,5,0,14,6]
+; AVX512DQBW-FAST-NEXT:    # zmm20 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm22, %zmm2
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm1, %zmm20, %zmm2
+; AVX512DQBW-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm19 = [0,13,6,7,0,13,6,7]
+; AVX512DQBW-FAST-NEXT:    # zmm19 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm3, %zmm19, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, %zmm2
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm22, %zmm23, %zmm1
+; AVX512DQBW-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm18 = [6,13,14,7,6,13,14,7]
+; AVX512DQBW-FAST-NEXT:    # zmm18 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm1, %zmm18, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm12, %zmm1
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm16, %zmm8, %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm11, %zmm2
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm21, %zmm4, %zmm2
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm30[0],ymm9[0],ymm30[2],ymm9[2]
-; AVX512DQBW-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm1[2,3,2,3],zmm4[2,3,2,3]
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm6[0],ymm13[0],ymm6[2],ymm13[2]
+; AVX512DQBW-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm1[2,3,2,3],zmm5[2,3,2,3]
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%r8), %zmm1
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%r9), %zmm3
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%r9), %zmm27
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, %zmm2
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm3, %zmm16, %zmm2
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm4, %zmm13, %zmm2
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm27, %zmm9, %zmm2
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm5, %zmm10, %zmm2
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, %zmm2
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm3, %zmm20, %zmm2
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm4, %zmm17, %zmm2
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm27, %zmm0, %zmm2
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm5, %zmm15, %zmm2
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, %zmm2
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm1, %zmm21, %zmm2
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm4, %zmm0, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm27, %zmm2
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm1, %zmm20, %zmm2
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm5, %zmm19, %zmm2
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm3, %zmm8, %zmm1
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm1, %zmm14, %zmm4
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm27, %zmm23, %zmm1
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm1, %zmm18, %zmm5
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 128(%rdi), %zmm1
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 128(%rsi), %zmm19
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm19, %zmm10, %zmm1
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 128(%rdx), %zmm3
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 128(%rcx), %zmm27
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm27, %zmm12, %zmm3
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, %zmm3 {%k1}
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 128(%rax), %zmm5
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 128(%rsi), %zmm26
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm26, %zmm8, %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 128(%rdx), %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 128(%rcx), %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm0, %zmm4, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 128(%rax), %zmm6
 ; AVX512DQBW-FAST-NEXT:    vmovdqa 128(%r9), %ymm0
 ; AVX512DQBW-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 128(%r8), %ymm24
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm2 = ymm24[0],ymm0[0],ymm24[2],ymm0[2]
-; AVX512DQBW-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm2[2,3,2,3],zmm5[2,3,2,3]
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 128(%r8), %zmm2
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 128(%r9), %zmm0
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm3
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm16, %zmm4
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm0, %zmm16, %zmm3
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm5, %zmm13, %zmm3
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm3
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm0, %zmm20, %zmm3
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm5, %zmm17, %zmm3
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm3
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm2, %zmm21, %zmm3
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm5, %zmm18, %zmm3
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa 128(%r8), %ymm5
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm5[0],ymm0[0],ymm5[2],ymm0[2]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %ymm5, %ymm30
+; AVX512DQBW-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm1[2,3,2,3],zmm6[2,3,2,3]
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm0, %zmm8, %zmm2
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm2, %zmm14, %zmm5
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 128(%r8), %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 128(%r9), %zmm16
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm5
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm16, %zmm9, %zmm5
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm9, %zmm1
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm6, %zmm10, %zmm5
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm5
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm16, %zmm7, %zmm5
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm6, %zmm15, %zmm5
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm16, %zmm5
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm2, %zmm20, %zmm5
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm6, %zmm19, %zmm5
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm16, %zmm23, %zmm2
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm2, %zmm18, %zmm6
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 192(%rdi), %zmm2
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 192(%rsi), %zmm16
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm10, %zmm1
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm16, %zmm10, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 192(%rsi), %zmm28
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm28, %zmm8, %zmm2
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 192(%rdx), %zmm5
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 192(%rcx), %zmm15
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm15, %zmm12, %zmm5
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 192(%rcx), %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm0, %zmm4, %zmm5
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm5 {%k1}
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 192(%rax), %zmm23
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 192(%r9), %ymm25
-; AVX512DQBW-FAST-NEXT:    vmovdqa 192(%r8), %ymm11
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm11[0],ymm25[0],ymm11[2],ymm25[2]
-; AVX512DQBW-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm3[2,3,2,3],zmm23[2,3,2,3]
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 192(%r8), %zmm3
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 192(%r9), %zmm6
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, %zmm5
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm6, %zmm4, %zmm5
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm4, %zmm2
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm23, %zmm13, %zmm5
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 192(%rax), %zmm22
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 192(%r9), %ymm24
+; AVX512DQBW-FAST-NEXT:    vmovdqa 192(%r8), %ymm9
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm2 = ymm9[0],ymm24[0],ymm9[2],ymm24[2]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %ymm9, %ymm25
+; AVX512DQBW-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm2[2,3,2,3],zmm22[2,3,2,3]
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm13, %zmm0
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, %zmm4
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm6, %zmm5
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm6, %zmm20, %zmm4
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm20, %zmm6
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm17, %zmm10
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm23, %zmm17, %zmm4
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm5, %zmm4
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 192(%r8), %zmm5
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 192(%r9), %zmm13
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm5, %zmm9
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm13, %zmm1, %zmm9
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm22, %zmm10, %zmm9
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm10, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm5, %zmm12
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm7, %zmm6
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm13, %zmm7, %zmm12
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm15, %zmm9
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm22, %zmm15, %zmm12
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm13, %zmm7
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm5, %zmm20, %zmm7
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm22, %zmm19, %zmm7
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm5, %zmm17
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm13, %zmm23, %zmm5
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm5, %zmm18, %zmm22
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 256(%rdi), %zmm5
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm3, %zmm21, %zmm4
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm23, %zmm18, %zmm4
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm5, %zmm8, %zmm3
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm3, %zmm14, %zmm23
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 256(%rdi), %zmm3
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 256(%rsi), %zmm3
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 256(%rsi), %zmm31
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm31, %zmm1, %zmm3
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, %zmm22
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 256(%rdx), %zmm28
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 256(%rcx), %zmm12
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm28, %zmm13
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm12, %zmm7, %zmm13
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm3, %zmm8, %zmm5
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm8, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 256(%rdx), %zmm7
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, %zmm13 {%k1}
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 256(%rax), %zmm23
-; AVX512DQBW-FAST-NEXT:    vmovdqa 256(%r9), %ymm9
-; AVX512DQBW-FAST-NEXT:    vmovdqa 256(%r8), %ymm5
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm4 = ymm5[0],ymm9[0],ymm5[2],ymm9[2]
-; AVX512DQBW-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm13 {%k2} = zmm4[2,3,2,3],zmm23[2,3,2,3]
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 256(%r8), %zmm4
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 256(%r9), %zmm1
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm4, %zmm13
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm1, %zmm2, %zmm13
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm3
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm23, %zmm0, %zmm13
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm17
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm4, %zmm0
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, %zmm2
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm1, %zmm20, %zmm0
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm23, %zmm10, %zmm0
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm10, %zmm1
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm10
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm4, %zmm21, %zmm10
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm21, %zmm29
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm23, %zmm18, %zmm10
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 256(%rcx), %zmm29
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm29, %zmm4, %zmm7
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm4, %zmm3
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm2, %zmm8, %zmm4
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm4, %zmm14, %zmm23
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 320(%rdi), %zmm23
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 320(%rsi), %zmm20
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm23, %zmm4
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm20, %zmm22, %zmm4
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 320(%rdx), %zmm0
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 320(%rcx), %zmm13
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm13, %zmm7, %zmm0
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm4, %zmm0 {%k1}
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 320(%rax), %zmm21
-; AVX512DQBW-FAST-NEXT:    vmovdqa 320(%r9), %ymm4
-; AVX512DQBW-FAST-NEXT:    vmovdqa 320(%r8), %ymm2
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm10 = ymm2[0],ymm4[0],ymm2[2],ymm4[2]
-; AVX512DQBW-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k2} = zmm10[2,3,2,3],zmm21[2,3,2,3]
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm5, %zmm7 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 256(%rax), %zmm22
+; AVX512DQBW-FAST-NEXT:    vmovdqa 256(%r9), %ymm15
+; AVX512DQBW-FAST-NEXT:    vmovdqa 256(%r8), %ymm11
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm5 = ymm11[0],ymm15[0],ymm11[2],ymm15[2]
+; AVX512DQBW-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm7 {%k2} = zmm5[2,3,2,3],zmm22[2,3,2,3]
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 256(%r8), %zmm7
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 256(%r9), %zmm5
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm7, %zmm12
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm5, %zmm1, %zmm12
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, %zmm8
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm22, %zmm10, %zmm12
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm7, %zmm12
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm5, %zmm6, %zmm12
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm22, %zmm9, %zmm12
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm9, %zmm4
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm5, %zmm9
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm7, %zmm20, %zmm9
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm22, %zmm19, %zmm9
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm7, %zmm12
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm5, %zmm23, %zmm7
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm7, %zmm18, %zmm22
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 320(%rdi), %zmm31
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 320(%rsi), %zmm21
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm31, %zmm7
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm21, %zmm2, %zmm7
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 320(%rdx), %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 320(%rcx), %zmm14
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm14, %zmm3, %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm7, %zmm1 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 320(%rax), %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqa 320(%r9), %ymm9
+; AVX512DQBW-FAST-NEXT:    vmovdqa 320(%r8), %ymm3
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm7 = ymm3[0],ymm9[0],ymm3[2],ymm9[2]
+; AVX512DQBW-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k2} = zmm7[2,3,2,3],zmm2[2,3,2,3]
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 320(%r8), %zmm10
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 320(%r9), %zmm22
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm10, %zmm0
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm22, %zmm3, %zmm0
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm21, %zmm17, %zmm0
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm10, %zmm0
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm22, %zmm6, %zmm0
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm21, %zmm1, %zmm0
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm22, %zmm0
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm10, %zmm29, %zmm0
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm21, %zmm18, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 320(%r9), %zmm7
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm10, %zmm1
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm7, %zmm8, %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm8, %zmm22
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm2, %zmm0, %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm10, %zmm1
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm7, %zmm6, %zmm1
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm2, %zmm4, %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm7, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm10, %zmm20, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm2, %zmm19, %zmm0
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm22, %zmm8, %zmm10
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm10, %zmm14, %zmm21
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm10, %zmm8
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm7, %zmm23, %zmm10
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm10, %zmm18, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQBW-FAST-NEXT:    vmovdqa {{.*#+}} ymm0 = [1,3,7,7]
+; AVX512DQBW-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQBW-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
 ; AVX512DQBW-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
 ; AVX512DQBW-FAST-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQBW-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm30 # 32-byte Folded Reload
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %ymm30, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQBW-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm24 # 32-byte Folded Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %ymm24, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQBW-FAST-NEXT:    vpermt2q %ymm25, %ymm0, %ymm11
+; AVX512DQBW-FAST-NEXT:    vpermt2q %ymm24, %ymm0, %ymm25
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %ymm25, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q %ymm15, %ymm0, %ymm11
 ; AVX512DQBW-FAST-NEXT:    vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQBW-FAST-NEXT:    vpermt2q %ymm9, %ymm0, %ymm5
-; AVX512DQBW-FAST-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQBW-FAST-NEXT:    vpermt2q %ymm4, %ymm0, %ymm2
-; AVX512DQBW-FAST-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa 384(%r9), %ymm1
-; AVX512DQBW-FAST-NEXT:    vmovdqa 384(%r8), %ymm2
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[2],ymm1[2]
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vpermt2q %ymm1, %ymm0, %ymm2
-; AVX512DQBW-FAST-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQBW-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm29 = [3,0,12,4,3,0,12,4]
-; AVX512DQBW-FAST-NEXT:    # zmm29 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm7, %zmm1
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm0, %zmm29, %zmm1
-; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm21 = [13,5,13,5,13,5,13,5]
-; AVX512DQBW-FAST-NEXT:    # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm2
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm7, %zmm21, %zmm2
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [6,14,6,14,6,14,6,14]
-; AVX512DQBW-FAST-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm2
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm7, %zmm3, %zmm2
+; AVX512DQBW-FAST-NEXT:    vpermt2q %ymm9, %ymm0, %ymm3
+; AVX512DQBW-FAST-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa 384(%r9), %ymm4
+; AVX512DQBW-FAST-NEXT:    vmovdqa 384(%r8), %ymm1
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm4[0],ymm1[2],ymm4[2]
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm7, %zmm8, %zmm0
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    movb $48, %r10b
+; AVX512DQBW-FAST-NEXT:    vpermt2q %ymm4, %ymm0, %ymm1
+; AVX512DQBW-FAST-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 384(%rdx), %zmm19
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 384(%rcx), %zmm15
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm19, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm15, %zmm23, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 384(%rdi), %zmm30
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 384(%rsi), %zmm9
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm9, %zmm23, %zmm30
+; AVX512DQBW-FAST-NEXT:    movb $24, %r10b
 ; AVX512DQBW-FAST-NEXT:    kmovd %r10d, %k3
-; AVX512DQBW-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm11 = [0,8,0,1,0,8,0,1]
-; AVX512DQBW-FAST-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm30 {%k3}
+; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm18 = [0,8,0,8,0,8,0,8]
+; AVX512DQBW-FAST-NEXT:    # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm2
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm26, %zmm11, %zmm2
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [1,0,10,2,1,0,10,2]
-; AVX512DQBW-FAST-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm26, %zmm2
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm0, %zmm7, %zmm2
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm1 {%k3} = zmm0[0],zmm26[0],zmm0[2],zmm26[2],zmm0[4],zmm26[4],zmm0[6],zmm26[6]
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm26, %zmm21, %zmm1
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} ymm30 = [6,14,6,14]
-; AVX512DQBW-FAST-NEXT:    # ymm30 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm26, %zmm30, %zmm1
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} ymm18 = [15,7,15,7]
-; AVX512DQBW-FAST-NEXT:    # ymm18 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm26, %zmm18, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm0 # 64-byte Folded Reload
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm4, %zmm29, %zmm1
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm4, %zmm2
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm0, %zmm21, %zmm2
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm4, %zmm2
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm0, %zmm3, %zmm2
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm0, %zmm8, %zmm4
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm4, %zmm0
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm2, %zmm11, %zmm0
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm0
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm4, %zmm7, %zmm0
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm1 {%k3} = zmm4[0],zmm2[0],zmm4[2],zmm2[2],zmm4[4],zmm2[4],zmm4[6],zmm2[6]
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm4, %zmm0
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm2, %zmm21, %zmm0
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm4, %zmm0
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm2, %zmm30, %zmm0
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm2, %zmm18, %zmm4
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm27, %zmm0
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm2, %zmm29, %zmm0
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm1
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm27, %zmm21, %zmm1
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm1
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm27, %zmm3, %zmm1
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm27, %zmm8, %zmm2
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm1
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm19, %zmm11, %zmm1
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm19, %zmm1
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm2, %zmm7, %zmm1
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm0 {%k3} = zmm2[0],zmm19[0],zmm2[2],zmm19[2],zmm2[4],zmm19[4],zmm2[6],zmm19[6]
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm0
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm19, %zmm21, %zmm0
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm0
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm19, %zmm30, %zmm0
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm19, %zmm18, %zmm2
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm15, %zmm0
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm2, %zmm29, %zmm0
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm1
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm15, %zmm21, %zmm1
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm1
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm15, %zmm3, %zmm1
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm15, %zmm8, %zmm2
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm1
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm16, %zmm11, %zmm1
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm16, %zmm17
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm2, %zmm7, %zmm17
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm0 {%k3} = zmm2[0],zmm16[0],zmm2[2],zmm16[2],zmm2[4],zmm16[4],zmm2[6],zmm16[6]
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm0
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm16, %zmm21, %zmm0
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm0
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm16, %zmm30, %zmm0
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm16, %zmm18, %zmm2
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm12, %zmm0
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm28, %zmm29, %zmm0
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm28, %zmm0
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm12, %zmm21, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm27, %zmm18, %zmm0
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm28, %zmm0
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm12, %zmm3, %zmm0
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm12, %zmm8, %zmm28
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm2
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm31, %zmm11, %zmm2
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm31, %zmm2
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm0, %zmm7, %zmm2
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm19
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm1 {%k3} = zmm0[0],zmm31[0],zmm0[2],zmm31[2],zmm0[4],zmm31[4],zmm0[6],zmm31[6]
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm31, %zmm21, %zmm1
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm31, %zmm30, %zmm1
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm31, %zmm18, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm16, %zmm18, %zmm0
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm13, %zmm0
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm15, %zmm29, %zmm0
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm15, %zmm1
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm13, %zmm21, %zmm1
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm15, %zmm1
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm13, %zmm3, %zmm1
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm13, %zmm8, %zmm15
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm23, %zmm1
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm20, %zmm11, %zmm1
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm13, %zmm18, %zmm17
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm5, %zmm18, %zmm12
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm7, %zmm18, %zmm8
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 384(%r8), %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm1 = <u,1,2,3,4,15,u,u>
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm0, %zmm1, %zmm30
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 384(%r9), %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm2 = <u,1,2,3,4,5,15,u>
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm1, %zmm2, %zmm30
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm1, %zmm0, %zmm18
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm1, %zmm0, %zmm22
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm1, %zmm0, %zmm6
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm0, %zmm20, %zmm1
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm20, %zmm28
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm23, %zmm7, %zmm28
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm0 {%k3} = zmm23[0],zmm20[0],zmm23[2],zmm20[2],zmm23[4],zmm20[4],zmm23[6],zmm20[6]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 448(%rdx), %zmm7
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 448(%rcx), %zmm0
+; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [6,14,6,14,6,14,6,14]
+; AVX512DQBW-FAST-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm7, %zmm2
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm0, %zmm1, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm7, %zmm10
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm0, %zmm2, %zmm10
+; AVX512DQBW-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [3,0,12,4,3,0,12,4]
+; AVX512DQBW-FAST-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm13
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm7, %zmm8, %zmm13
+; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm5 = [13,5,13,5,13,5,13,5]
+; AVX512DQBW-FAST-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm7, %zmm17
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm0, %zmm5, %zmm17
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm0, %zmm23, %zmm7
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm11, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm4, %zmm8, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm3
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm4, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm11, %zmm5, %zmm0
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm23, %zmm0
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm20, %zmm21, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm4, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm11, %zmm1, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm11, %zmm23, %zmm4
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    movb $48, %r10b
+; AVX512DQBW-FAST-NEXT:    kmovd %r10d, %k4
+; AVX512DQBW-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm4 = [0,8,0,1,0,8,0,1]
+; AVX512DQBW-FAST-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm11, %zmm6
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm0, %zmm4, %zmm6
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm6 = [1,0,10,2,1,0,10,2]
+; AVX512DQBW-FAST-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm12
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm11, %zmm6, %zmm12
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm3 {%k4} = zmm11[0],zmm0[0],zmm11[2],zmm0[2],zmm11[4],zmm0[4],zmm11[6],zmm0[6]
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm11, %zmm3
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm0, %zmm5, %zmm3
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm11, %zmm3
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm0, %zmm1, %zmm3
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm0, %zmm23, %zmm11
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm16, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm11, %zmm8, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm3
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm11, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm16, %zmm5, %zmm0
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm23, %zmm0
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm20, %zmm30, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm11, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm16, %zmm1, %zmm0
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm20, %zmm18, %zmm23
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm23, %zmm26
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 448(%rdx), %zmm20
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 448(%rcx), %zmm5
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm20, %zmm13
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm16, %zmm23, %zmm11
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm12, %zmm11
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm5, %zmm0, %zmm13
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm20, %zmm10
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm5, %zmm3, %zmm10
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm5, %zmm9
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm20, %zmm29, %zmm9
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm20, %zmm16
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm5, %zmm21, %zmm16
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm5, %zmm8, %zmm20
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 384(%rdx), %zmm1
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 384(%rcx), %zmm4
-; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm4, %zmm1, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm0, %zmm4, %zmm11
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm11
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm12, %zmm6, %zmm11
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm3 {%k4} = zmm12[0],zmm0[0],zmm12[2],zmm0[2],zmm12[4],zmm0[4],zmm12[6],zmm0[6]
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm12, %zmm3
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm0, %zmm5, %zmm3
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm12, %zmm3
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm0, %zmm1, %zmm3
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm0, %zmm23, %zmm12
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm3
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm12, %zmm8, %zmm3
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm12, %zmm11
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm0, %zmm5, %zmm11
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm12, %zmm11
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm0, %zmm1, %zmm11
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm0, %zmm23, %zmm12
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm12, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm26, %zmm4, %zmm0
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm1, %zmm4, %zmm29
-; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm4, %zmm1, %zmm3
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm26, %zmm11
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm12, %zmm6, %zmm11
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm3 {%k4} = zmm12[0],zmm26[0],zmm12[2],zmm26[2],zmm12[4],zmm26[4],zmm12[6],zmm26[6]
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, %zmm0
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm4, %zmm8, %zmm1
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm4, %zmm21, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm12, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm26, %zmm5, %zmm0
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 448(%rdi), %zmm12
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 448(%rsi), %zmm4
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm12, %zmm8
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm12, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm26, %zmm1, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm26, %zmm23, %zmm12
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, %zmm12
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm4, %zmm0, %zmm8
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm12, %zmm3
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm4, %zmm30, %zmm3
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm12, %zmm2
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm4, %zmm21, %zmm2
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 384(%rdi), %zmm6
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 384(%rsi), %zmm5
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm6, %zmm25
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm5, %zmm11, %zmm25
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm5, %zmm24
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm6, %zmm7, %zmm24
-; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm5, %zmm6, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm0, %zmm8, %zmm12
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm16
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm3, %zmm5, %zmm16
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm16
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm3, %zmm1, %zmm16
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm3, %zmm23, %zmm0
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm29 {%k3} = zmm6[0],zmm5[0],zmm6[2],zmm5[2],zmm6[4],zmm5[4],zmm6[6],zmm5[6]
-; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm5, %zmm6, %zmm21
-; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm5, %zmm6, %zmm30
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm5, %zmm18, %zmm6
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm9 {%k3} = zmm12[0],zmm4[0],zmm12[2],zmm4[2],zmm12[4],zmm4[4],zmm12[6],zmm4[6]
-; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm4, %zmm12, %zmm11
-; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm12, %zmm4, %zmm7
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm4, %zmm18, %zmm12
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm8, %zmm13 {%k1}
-; AVX512DQBW-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm10[4,5,6,7]
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 448(%r8), %zmm5
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm4 = <0,11,u,u,4,5,6,7>
-; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm5, %zmm9, %zmm4
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 448(%r9), %zmm8
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm14 = <0,1,11,u,4,5,6,7>
-; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm8, %zmm4, %zmm14
-; AVX512DQBW-FAST-NEXT:    movb $4, %sil
-; AVX512DQBW-FAST-NEXT:    kmovd %esi, %k3
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm5, %zmm13 {%k3}
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm4 = <0,1,2,10,u,5,6,7>
-; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm8, %zmm13, %zmm4
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm16, %zmm2 {%k1}
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm10 = <12,u,u,3,4,5,6,13>
-; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm5, %zmm2, %zmm10
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm18 = <0,12,u,3,4,5,6,7>
-; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm8, %zmm10, %zmm18
-; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm10 = [0,8,0,8,0,8,0,8]
-; AVX512DQBW-FAST-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm0 # 64-byte Folded Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm3
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm28, %zmm4, %zmm3
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm28, %zmm3
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm0, %zmm6, %zmm3
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, %zmm24
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm12 {%k4} = zmm0[0],zmm28[0],zmm0[2],zmm28[2],zmm0[4],zmm28[4],zmm0[6],zmm28[6]
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm3
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm28, %zmm5, %zmm3
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm3
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm28, %zmm1, %zmm3
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm28, %zmm23, %zmm0
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm29, %zmm28
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm0 # 64-byte Folded Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm0, %zmm8, %zmm28
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm3
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm29, %zmm5, %zmm3
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm3
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm29, %zmm1, %zmm3
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm29, %zmm23, %zmm0
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm0 # 64-byte Folded Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm29
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm3, %zmm4, %zmm29
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, %zmm27
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm0, %zmm6, %zmm27
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm28 {%k4} = zmm0[0],zmm3[0],zmm0[2],zmm3[2],zmm0[4],zmm3[4],zmm0[6],zmm3[6]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm12
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm3, %zmm5, %zmm12
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm12
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm3, %zmm1, %zmm12
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm3, %zmm23, %zmm0
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm14, %zmm26
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm0 # 64-byte Folded Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm0, %zmm8, %zmm26
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm3
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm14, %zmm5, %zmm3
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm3
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm14, %zmm1, %zmm3
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm14, %zmm23, %zmm0
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm23 # 64-byte Folded Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm22, %zmm10, %zmm27
-; AVX512DQBW-FAST-NEXT:    movb $24, %sil
-; AVX512DQBW-FAST-NEXT:    kmovd %esi, %k4
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, %zmm6 {%k4}
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 448(%rdi), %zmm12
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 448(%rsi), %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm12, %zmm16
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm0, %zmm23, %zmm16
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm21, %zmm25
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm31, %zmm6, %zmm25
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm26 {%k4} = zmm31[0],zmm21[0],zmm31[2],zmm21[2],zmm31[4],zmm21[4],zmm31[6],zmm21[6]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm31, %zmm22
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm31, %zmm14
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm31, %zmm3
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm21, %zmm23, %zmm31
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm21, %zmm4, %zmm22
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm21, %zmm5, %zmm14
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm21, %zmm1, %zmm3
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm15, %zmm19, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm19, %zmm15, %zmm8
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm19, %zmm2
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm15, %zmm5, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm15, %zmm1, %zmm19
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm12, %zmm19
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm0, %zmm1, %zmm19
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm12, %zmm3
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm0, %zmm15, %zmm3
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm12, %zmm2
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm0, %zmm5, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm9, %zmm20
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm14, %zmm6, %zmm20
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm9, %zmm14, %zmm15
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm8 {%k4} = zmm14[0],zmm9[0],zmm14[2],zmm9[2],zmm14[4],zmm9[4],zmm14[6],zmm9[6]
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm9, %zmm14, %zmm5
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm14, %zmm23
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm9, %zmm1, %zmm14
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm9, %zmm4, %zmm23
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm13 {%k4} = zmm12[0],zmm0[0],zmm12[2],zmm0[2],zmm12[4],zmm0[4],zmm12[6],zmm0[6]
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm0, %zmm12, %zmm4
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm12, %zmm6, %zmm0
+; AVX512DQBW-FAST-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm19, %zmm1 # 64-byte Folded Reload
+; AVX512DQBW-FAST-NEXT:    # zmm1 = zmm19[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, %zmm10 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 448(%r8), %zmm3
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm9 = <0,11,u,u,4,5,6,7>
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm3, %zmm13, %zmm9
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm17, %zmm2 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm12 = <12,u,u,3,4,5,6,13>
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm3, %zmm2, %zmm12
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm7, %zmm16 {%k3}
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm2 = <u,1,2,3,4,15,u,u>
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm3, %zmm2, %zmm16
 ; AVX512DQBW-FAST-NEXT:    movb $6, %sil
 ; AVX512DQBW-FAST-NEXT:    kmovd %esi, %k5
-; AVX512DQBW-FAST-NEXT:    vpbroadcastq 456(%rcx), %ymm0
-; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm7 {%k5}
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm1 = <0,1,2,9,u,u,6,7>
-; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm5, %zmm7, %zmm1
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm20, %zmm12 {%k4}
+; AVX512DQBW-FAST-NEXT:    vpbroadcastq 456(%rcx), %ymm2
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = mem[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm0, %zmm0 {%k5}
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm2 = <0,1,2,9,u,u,6,7>
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm3, %zmm0, %zmm2
 ; AVX512DQBW-FAST-NEXT:    movb $64, %sil
-; AVX512DQBW-FAST-NEXT:    kmovd %esi, %k3
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm5, %zmm3 {%k3}
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm2 = <u,1,2,3,4,15,u,u>
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm5, %zmm2, %zmm12
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 384(%r8), %zmm5
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm5, %zmm2, %zmm6
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 384(%r9), %zmm2
-; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm2, %zmm5, %zmm10
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm2, %zmm5, %zmm16
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm2, %zmm5, %zmm13
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm5, %zmm2, %zmm0
+; AVX512DQBW-FAST-NEXT:    kmovd %esi, %k4
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, %zmm1 {%k4}
+; AVX512DQBW-FAST-NEXT:    movb $4, %sil
+; AVX512DQBW-FAST-NEXT:    kmovd %esi, %k4
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, %zmm10 {%k4}
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 448(%r9), %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm21 = <0,1,11,u,4,5,6,7>
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm0, %zmm9, %zmm21
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm3 = <0,1,2,10,u,5,6,7>
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm0, %zmm10, %zmm3
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm7 = <0,12,u,3,4,5,6,7>
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm0, %zmm12, %zmm7
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm9 = <13,u,2,3,4,5,6,14>
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm0, %zmm1, %zmm9
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm1 = <u,1,2,3,4,5,15,u>
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm0, %zmm1, %zmm16
 ; AVX512DQBW-FAST-NEXT:    movb $12, %sil
-; AVX512DQBW-FAST-NEXT:    kmovd %esi, %k3
-; AVX512DQBW-FAST-NEXT:    vmovdqa 448(%rdx), %xmm5
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0]
-; AVX512DQBW-FAST-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm5, %zmm0, %zmm11 {%k3}
-; AVX512DQBW-FAST-NEXT:    vinserti32x4 $2, 448(%r8), %zmm11, %zmm5
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm7 = <0,1,2,3,4,8,u,7>
-; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm8, %zmm5, %zmm7
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm5 = <u,1,2,3,4,5,15,u>
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm2, %zmm5, %zmm6
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm11 = <0,1,2,3,9,u,6,7>
-; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm8, %zmm1, %zmm11
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm2 = <13,u,2,3,4,5,6,14>
-; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm8, %zmm3, %zmm2
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm8, %zmm5, %zmm12
-; AVX512DQBW-FAST-NEXT:    vpbroadcastq 8(%rcx), %ymm1
-; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm9 {%k5}
-; AVX512DQBW-FAST-NEXT:    vpbroadcastq 72(%rcx), %ymm1
-; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm31 {%k5}
-; AVX512DQBW-FAST-NEXT:    vpbroadcastq 136(%rcx), %ymm1
-; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm8 {%k5}
-; AVX512DQBW-FAST-NEXT:    vpbroadcastq 200(%rcx), %ymm1
-; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm17, %zmm3
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm3 {%k5}
-; AVX512DQBW-FAST-NEXT:    vpbroadcastq 264(%rcx), %ymm1
-; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm19, %zmm20
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm20 {%k5}
-; AVX512DQBW-FAST-NEXT:    vpbroadcastq 328(%rcx), %ymm1
-; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm28 {%k5}
-; AVX512DQBW-FAST-NEXT:    vpbroadcastq 392(%rcx), %ymm1
-; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm24 {%k5}
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 448(%rax), %zmm5
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,10,5,6,7]
-; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm5, %zmm4, %zmm1
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,1,12,3,4,5,6,7]
-; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm5, %zmm18, %zmm1
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 384(%rax), %zmm1
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm1, %zmm4, %zmm16
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm16, %zmm4
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm17, %zmm16 {%k1}
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm16 {%k2} = zmm17[2,3,2,3],zmm1[2,3,2,3]
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm1, %zmm16, %zmm13
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm13, %zmm18
+; AVX512DQBW-FAST-NEXT:    kmovd %esi, %k4
+; AVX512DQBW-FAST-NEXT:    vmovdqa 448(%rdx), %xmm1
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0]
+; AVX512DQBW-FAST-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm4 {%k4}
+; AVX512DQBW-FAST-NEXT:    vinserti32x4 $2, 448(%r8), %zmm4, %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm4 = <0,1,2,3,4,8,u,7>
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm0, %zmm1, %zmm4
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm10 = <0,1,2,3,9,u,6,7>
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm0, %zmm2, %zmm10
+; AVX512DQBW-FAST-NEXT:    vpbroadcastq 8(%rcx), %ymm0
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k5}
+; AVX512DQBW-FAST-NEXT:    vpbroadcastq 72(%rcx), %ymm0
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm6 {%k5}
+; AVX512DQBW-FAST-NEXT:    vpbroadcastq 136(%rcx), %ymm0
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm11, %zmm15
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm15 {%k5}
+; AVX512DQBW-FAST-NEXT:    vpbroadcastq 200(%rcx), %ymm0
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm24, %zmm14
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm14 {%k5}
+; AVX512DQBW-FAST-NEXT:    vpbroadcastq 264(%rcx), %ymm0
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm27 {%k5}
+; AVX512DQBW-FAST-NEXT:    vpbroadcastq 328(%rcx), %ymm0
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm25 {%k5}
+; AVX512DQBW-FAST-NEXT:    vpbroadcastq 392(%rcx), %ymm0
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm20 {%k5}
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 384(%rax), %zmm0
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm1, %zmm13, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm0, %zmm1, %zmm13
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, %zmm11 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm11 {%k2} = zmm1[2,3,2,3],zmm0[2,3,2,3]
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm0, %zmm1, %zmm24
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm0, %zmm1, %zmm11
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [14,1,2,3,4,5,6,15]
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm0, %zmm1, %zmm30
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 448(%rax), %zmm12
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,10,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm12, %zmm3, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [0,1,12,3,4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm12, %zmm7, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [0,13,2,3,4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm12, %zmm9, %zmm0
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm12, %zmm1, %zmm16
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,5,8,7]
-; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm5, %zmm7, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm12, %zmm4, %zmm0
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm7 = [14,1,2,3,4,5,6,15]
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm1, %zmm7, %zmm6
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,9,6,7]
-; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm5, %zmm11, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm12, %zmm10, %zmm0
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [0,13,2,3,4,5,6,7]
-; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm5, %zmm2, %zmm0
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm5, %zmm7, %zmm12
 ; AVX512DQBW-FAST-NEXT:    movb $8, %sil
 ; AVX512DQBW-FAST-NEXT:    kmovd %esi, %k2
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm5, %zmm14 {%k2}
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm12, %zmm21 {%k2}
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k4}
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm5 {%k4}
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k3}
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm7 {%k4}
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm7 {%k3}
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k4}
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm3 {%k3}
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm9 {%k3}
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm4 {%k3}
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm11 {%k4}
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm15, %zmm26 {%k4}
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm31 {%k3}
 ; AVX512DQBW-FAST-NEXT:    movb $-31, %sil
 ; AVX512DQBW-FAST-NEXT:    kmovd %esi, %k2
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k2}
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm5 {%k2}
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm7 {%k2}
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k2}
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm3 {%k2}
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm11 {%k2}
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm9 {%k2}
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm26 {%k2}
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa (%rdx), %xmm0
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0]
-; AVX512DQBW-FAST-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm13 {%k3}
-; AVX512DQBW-FAST-NEXT:    vmovdqa 64(%rdx), %xmm0
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0]
-; AVX512DQBW-FAST-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm4 {%k2}
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm31 {%k2}
+; AVX512DQBW-FAST-NEXT:    vmovdqa (%rdx), %xmm4
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],mem[0]
+; AVX512DQBW-FAST-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm1 {%k3}
-; AVX512DQBW-FAST-NEXT:    vmovdqa 128(%rdx), %xmm0
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0]
-; AVX512DQBW-FAST-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k3}
-; AVX512DQBW-FAST-NEXT:    vmovdqa 192(%rdx), %xmm0
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0]
-; AVX512DQBW-FAST-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm5 {%k3}
-; AVX512DQBW-FAST-NEXT:    vmovdqa 256(%rdx), %xmm0
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0]
-; AVX512DQBW-FAST-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm0, %zmm1 {%k4}
+; AVX512DQBW-FAST-NEXT:    vmovdqa 64(%rdx), %xmm4
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],mem[0]
+; AVX512DQBW-FAST-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm0, %zmm3 {%k4}
+; AVX512DQBW-FAST-NEXT:    vmovdqa 128(%rdx), %xmm4
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],mem[0]
+; AVX512DQBW-FAST-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm7 {%k3}
-; AVX512DQBW-FAST-NEXT:    vmovdqa 320(%rdx), %xmm0
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0]
-; AVX512DQBW-FAST-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm11 {%k3}
-; AVX512DQBW-FAST-NEXT:    vmovdqa 384(%rdx), %xmm0
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0]
-; AVX512DQBW-FAST-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm25 {%k3}
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm0, %zmm7 {%k4}
+; AVX512DQBW-FAST-NEXT:    vmovdqa 192(%rdx), %xmm4
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],mem[0]
+; AVX512DQBW-FAST-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm0, %zmm9 {%k4}
+; AVX512DQBW-FAST-NEXT:    vmovdqa 256(%rdx), %xmm4
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],mem[0]
+; AVX512DQBW-FAST-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm0, %zmm29 {%k4}
+; AVX512DQBW-FAST-NEXT:    vmovdqa 320(%rdx), %xmm4
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],mem[0]
+; AVX512DQBW-FAST-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm0, %zmm22 {%k4}
+; AVX512DQBW-FAST-NEXT:    vmovdqa 384(%rdx), %xmm4
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],mem[0]
+; AVX512DQBW-FAST-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm0, %zmm23 {%k4}
 ; AVX512DQBW-FAST-NEXT:    movb $112, %cl
 ; AVX512DQBW-FAST-NEXT:    kmovd %ecx, %k2
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vinserti64x2 $3, (%rax), %zmm0, %zmm13 {%k2}
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vinserti64x2 $3, 64(%rax), %zmm0, %zmm1 {%k2}
+; AVX512DQBW-FAST-NEXT:    vinserti64x2 $3, (%rax), %zmm0, %zmm1 {%k2}
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vinserti64x2 $3, 128(%rax), %zmm0, %zmm2 {%k2}
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vinserti64x2 $3, 64(%rax), %zmm0, %zmm3 {%k2}
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vinserti64x2 $3, 192(%rax), %zmm0, %zmm5 {%k2}
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vinserti64x2 $3, 256(%rax), %zmm23, %zmm7 {%k2}
+; AVX512DQBW-FAST-NEXT:    vinserti64x2 $3, 128(%rax), %zmm0, %zmm7 {%k2}
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vinserti64x2 $3, 320(%rax), %zmm27, %zmm11 {%k2}
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm11, %zmm27
-; AVX512DQBW-FAST-NEXT:    vinserti64x2 $3, 384(%rax), %zmm10, %zmm25 {%k2}
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vinserti64x2 $3, 192(%rax), %zmm0, %zmm9 {%k2}
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vinserti64x2 $3, 256(%rax), %zmm0, %zmm29 {%k2}
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vinserti64x2 $3, 320(%rax), %zmm0, %zmm22 {%k2}
+; AVX512DQBW-FAST-NEXT:    vinserti64x2 $3, 384(%rax), %zmm18, %zmm23 {%k2}
 ; AVX512DQBW-FAST-NEXT:    movb $56, %cl
 ; AVX512DQBW-FAST-NEXT:    kmovd %ecx, %k2
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm9 {%k2}
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k2}
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm31 {%k2}
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm6 {%k2}
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm8 {%k2}
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm15 {%k2}
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm3 {%k2}
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm14 {%k2}
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm20 {%k2}
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm20, %zmm26
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm27 {%k2}
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm28 {%k2}
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm4, %zmm24 {%k2}
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm25 {%k2}
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm13, %zmm20 {%k2}
 ; AVX512DQBW-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm4 = ymm0[0,1,2,3,4,5],mem[6,7]
 ; AVX512DQBW-FAST-NEXT:    movb $14, %cl
 ; AVX512DQBW-FAST-NEXT:    kmovd %ecx, %k2
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm23 {%k2}
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm0, %zmm17 {%k2}
 ; AVX512DQBW-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm22 {%k2}
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm4 = ymm0[0,1,2,3,4,5],mem[6,7]
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm0, %zmm15 {%k2}
 ; AVX512DQBW-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm20 {%k2}
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm4 = ymm0[0,1,2,3,4,5],mem[6,7]
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm0, %zmm14 {%k2}
 ; AVX512DQBW-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm16 {%k2}
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm4 = ymm0[0,1,2,3,4,5],mem[6,7]
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm0, %zmm7 {%k2}
 ; AVX512DQBW-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm14 {%k2}
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm4 = ymm0[0,1,2,3,4,5],mem[6,7]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm0, %zmm28 {%k2}
 ; AVX512DQBW-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm4 {%k2}
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm4 = ymm0[0,1,2,3,4,5],mem[6,7]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm0, %zmm26 {%k2}
 ; AVX512DQBW-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm0, %zmm29 {%k2}
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm4 = ymm0[0,1,2,3,4,5],mem[6,7]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm0, %zmm8 {%k2}
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
@@ -20143,155 +20052,154 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k1}
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm3 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm4 {%k1}
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm5 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm10 {%k1}
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm7 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm12 {%k1}
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm8 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm13 {%k1}
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm21 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm5 {%k1}
 ; AVX512DQBW-FAST-NEXT:    movb $120, %al
 ; AVX512DQBW-FAST-NEXT:    kmovd %eax, %k1
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, %zmm19 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, %zmm11 {%k1}
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm17 # 64-byte Folded Reload
-; AVX512DQBW-FAST-NEXT:    # zmm17 = zmm0[0,1,2,3],mem[4,5,6,7]
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm15 {%k1}
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, %zmm13 {%k1}
+; AVX512DQBW-FAST-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm9 # 64-byte Folded Reload
+; AVX512DQBW-FAST-NEXT:    # zmm9 = zmm0[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm6 {%k1}
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm5, %zmm3 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm4, %zmm3 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm10, %zmm2 {%k1}
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm7, %zmm1 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm12, %zmm1 {%k1}
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm8, %zmm0 {%k1}
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm21, %zmm18 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm13, %zmm0 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm5, %zmm24 {%k1}
 ; AVX512DQBW-FAST-NEXT:    movb $-61, %al
 ; AVX512DQBW-FAST-NEXT:    kmovd %eax, %k1
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm17 {%k1}
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm5 # 64-byte Folded Reload
-; AVX512DQBW-FAST-NEXT:    # zmm5 = zmm2[0,1,2,3],mem[4,5,6,7]
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm5 {%k1}
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm7 # 64-byte Folded Reload
-; AVX512DQBW-FAST-NEXT:    # zmm7 = zmm2[0,1,2,3],mem[4,5,6,7]
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm7 {%k1}
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm8 # 64-byte Folded Reload
-; AVX512DQBW-FAST-NEXT:    # zmm8 = zmm2[0,1,2,3],mem[4,5,6,7]
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm8 {%k1}
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm9 # 64-byte Folded Reload
-; AVX512DQBW-FAST-NEXT:    # zmm9 = zmm2[0,1,2,3],mem[4,5,6,7]
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm9 {%k1}
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm10 # 64-byte Folded Reload
-; AVX512DQBW-FAST-NEXT:    # zmm10 = zmm2[0,1,2,3],mem[4,5,6,7]
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm10 {%k1}
-; AVX512DQBW-FAST-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm30, %zmm11 # 64-byte Folded Reload
-; AVX512DQBW-FAST-NEXT:    # zmm11 = zmm30[0,1,2,3],mem[4,5,6,7]
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm11 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm4, %zmm9 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload
+; AVX512DQBW-FAST-NEXT:    # zmm4 = zmm4[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm5, %zmm4 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm5 # 64-byte Folded Reload
+; AVX512DQBW-FAST-NEXT:    # zmm5 = zmm5[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm10, %zmm5 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm10 # 64-byte Folded Reload
+; AVX512DQBW-FAST-NEXT:    # zmm10 = zmm10[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm12, %zmm10 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm12 # 64-byte Folded Reload
+; AVX512DQBW-FAST-NEXT:    # zmm12 = zmm12[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm13, %zmm12 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm13 # 64-byte Folded Reload
+; AVX512DQBW-FAST-NEXT:    # zmm13 = zmm13[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm18, %zmm13 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm18 # 64-byte Folded Reload
+; AVX512DQBW-FAST-NEXT:    # zmm18 = zmm18[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm19, %zmm18 {%k1}
 ; AVX512DQBW-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm11, 3008(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm18, 2944(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm29, 2880(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovaps %zmm2, 2816(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm24, 2752(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm25, 2688(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovaps %zmm2, 2624(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm10, 2560(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm18, 3008(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm24, 2944(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm8, 2880(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm8, 2816(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm20, 2752(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm23, 2688(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm31, 2624(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm13, 2560(%rax)
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, 2496(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm4, 2432(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovaps %zmm2, 2368(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm28, 2304(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm27, 2240(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm26, 2432(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm8, 2368(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm25, 2304(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm22, 2240(%rax)
 ; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 2176(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm9, 2112(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm12, 2112(%rax)
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, 2048(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm14, 1984(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovaps %zmm2, 1920(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm26, 1856(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 1792(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm28, 1984(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm8, 1920(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm27, 1856(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm29, 1792(%rax)
 ; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 1728(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm8, 1664(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, 1600(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm16, 1536(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovaps %zmm2, 1472(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm10, 1664(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, 1600(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm7, 1536(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm8, 1472(%rax)
 ; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 1408(%rax)
 ; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 1344(%rax)
 ; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 1280(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm7, 1216(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm13, 1152(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm20, 1088(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovaps %zmm2, 1024(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm5, 1216(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, 1152(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm14, 1088(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm5, 1024(%rax)
 ; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 960(%rax)
 ; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 896(%rax)
 ; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 832(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm5, 768(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm15, 704(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm22, 640(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovaps %zmm2, 576(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm31, 512(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm4, 768(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm6, 704(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm15, 640(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm4, 576(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 512(%rax)
 ; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 448(%rax)
 ; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 384(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm17, 320(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm19, 256(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm23, 192(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 128(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm9, 320(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm11, 256(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm17, 192(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm4, 128(%rax)
 ; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 64(%rax)
 ; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, (%rax)
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm12, 3520(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm16, 3520(%rax)
 ; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 3456(%rax)
 ; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 3392(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 3328(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm21, 3328(%rax)
 ; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 3264(%rax)
 ; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 3200(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm6, 3072(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm30, 3072(%rax)
 ; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 3136(%rax)
-; AVX512DQBW-FAST-NEXT:    addq $6568, %rsp # imm = 0x19A8
+; AVX512DQBW-FAST-NEXT:    addq $6120, %rsp # imm = 0x17E8
 ; AVX512DQBW-FAST-NEXT:    vzeroupper
 ; AVX512DQBW-FAST-NEXT:    retq
   %in.vec0 = load <64 x i64>, ptr %in.vecptr0, align 64

diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-8.ll
index 8bad8e79ae3616..87b5732cc1aa32 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-8.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-8.ll
@@ -784,9 +784,9 @@ define void @store_i64_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %r10
 ; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %r11
 ; AVX512F-NEXT:    vmovdqa64 (%rdi), %zmm6
-; AVX512F-NEXT:    vmovdqa64 (%rsi), %zmm9
+; AVX512F-NEXT:    vmovdqa64 (%rsi), %zmm8
 ; AVX512F-NEXT:    vmovdqa64 (%rdx), %zmm7
-; AVX512F-NEXT:    vmovdqa64 (%rcx), %zmm8
+; AVX512F-NEXT:    vmovdqa64 (%rcx), %zmm11
 ; AVX512F-NEXT:    vmovdqa64 (%r8), %zmm0
 ; AVX512F-NEXT:    vmovdqa64 (%r9), %zmm2
 ; AVX512F-NEXT:    vmovdqa64 (%r11), %zmm1
@@ -804,53 +804,47 @@ define void @store_i64_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512F-NEXT:    vmovdqa (%rdi), %xmm5
 ; AVX512F-NEXT:    vinserti128 $1, (%rdx), %ymm5, %ymm12
 ; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm5 = ymm12[1],ymm10[1],ymm12[3],ymm10[3]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm5, %zmm4, %zmm17
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm5, %zmm4, %zmm16
 ; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm5 = [6,14,6,14,6,14,6,14]
 ; AVX512F-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm11
-; AVX512F-NEXT:    vpermt2q %zmm2, %zmm5, %zmm11
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm11 {%k1} = zmm1[0],zmm3[0],zmm1[2],zmm3[2],zmm1[4],zmm3[4],zmm1[6],zmm3[6]
-; AVX512F-NEXT:    vpermi2q %zmm9, %zmm6, %zmm5
-; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm13 = [6,14,6,14]
-; AVX512F-NEXT:    # ymm13 = mem[0,1,0,1]
-; AVX512F-NEXT:    vpermi2q %zmm8, %zmm7, %zmm13
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm13[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm5, %zmm11, %zmm5
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm11 = [7,15,7,15,7,15,7,15]
-; AVX512F-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm13
-; AVX512F-NEXT:    vpermt2q %zmm2, %zmm11, %zmm13
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm13 {%k1} = zmm1[1],zmm3[1],zmm1[3],zmm3[3],zmm1[5],zmm3[5],zmm1[7],zmm3[7]
-; AVX512F-NEXT:    vpermi2q %zmm9, %zmm6, %zmm11
-; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm14 = [7,15,7,15]
-; AVX512F-NEXT:    # ymm14 = mem[0,1,0,1]
-; AVX512F-NEXT:    vpermi2q %zmm8, %zmm7, %zmm14
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm14[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm11, %zmm13, %zmm11
+; AVX512F-NEXT:    vmovdqa64 %zmm7, %zmm9
+; AVX512F-NEXT:    vpermt2q %zmm11, %zmm5, %zmm9
+; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm13
+; AVX512F-NEXT:    vpermt2q %zmm8, %zmm5, %zmm13
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3],ymm9[4,5,6,7]
+; AVX512F-NEXT:    vpermi2q %zmm2, %zmm0, %zmm5
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm1[0],zmm3[0],zmm1[2],zmm3[2],zmm1[4],zmm3[4],zmm1[6],zmm3[6]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm9, %zmm5, %zmm5
+; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm9 = [7,15,7,15,7,15,7,15]
+; AVX512F-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vmovdqa64 %zmm7, %zmm13
+; AVX512F-NEXT:    vpermt2q %zmm11, %zmm9, %zmm13
+; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm14
+; AVX512F-NEXT:    vpermt2q %zmm8, %zmm9, %zmm14
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7]
+; AVX512F-NEXT:    vpermi2q %zmm2, %zmm0, %zmm9
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm9 {%k1} = zmm1[1],zmm3[1],zmm1[3],zmm3[3],zmm1[5],zmm3[5],zmm1[7],zmm3[7]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm13, %zmm9, %zmm9
 ; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm13 = [4,12,4,12,4,12,4,12]
 ; AVX512F-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm14
-; AVX512F-NEXT:    vpermt2q %zmm3, %zmm13, %zmm14
+; AVX512F-NEXT:    vmovdqa64 %zmm7, %zmm14
+; AVX512F-NEXT:    vpermt2q %zmm11, %zmm13, %zmm14
+; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm15
+; AVX512F-NEXT:    vpermt2q %zmm8, %zmm13, %zmm15
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3],ymm14[4,5,6,7]
+; AVX512F-NEXT:    vpermi2q %zmm3, %zmm1, %zmm13
 ; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm15 = zmm0[0],zmm2[0],zmm0[2],zmm2[2],zmm0[4],zmm2[4],zmm0[6],zmm2[6]
-; AVX512F-NEXT:    vmovdqa64 %zmm14, %zmm15 {%k1}
-; AVX512F-NEXT:    vpermi2q %zmm9, %zmm6, %zmm13
-; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm14 = [4,12,4,12]
-; AVX512F-NEXT:    # ymm14 = mem[0,1,0,1]
-; AVX512F-NEXT:    vpermi2q %zmm8, %zmm7, %zmm14
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm13, %zmm15, %zmm13
+; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm15 {%k1}
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm14, %zmm15, %zmm13
 ; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm14 = [5,13,5,13,5,13,5,13]
 ; AVX512F-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm15
-; AVX512F-NEXT:    vpermt2q %zmm3, %zmm14, %zmm15
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm16 = zmm0[1],zmm2[1],zmm0[3],zmm2[3],zmm0[5],zmm2[5],zmm0[7],zmm2[7]
-; AVX512F-NEXT:    vmovdqa64 %zmm15, %zmm16 {%k1}
-; AVX512F-NEXT:    vpermt2q %zmm9, %zmm14, %zmm6
-; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm9 = [5,13,5,13]
-; AVX512F-NEXT:    # ymm9 = mem[0,1,0,1]
-; AVX512F-NEXT:    vpermi2q %zmm8, %zmm7, %zmm9
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm9[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm6, %zmm16, %zmm6
+; AVX512F-NEXT:    vpermt2q %zmm11, %zmm14, %zmm7
+; AVX512F-NEXT:    vpermt2q %zmm8, %zmm14, %zmm6
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7]
+; AVX512F-NEXT:    vpermi2q %zmm3, %zmm1, %zmm14
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm7 = zmm0[1],zmm2[1],zmm0[3],zmm2[3],zmm0[5],zmm2[5],zmm0[7],zmm2[7]
+; AVX512F-NEXT:    vmovdqa64 %zmm14, %zmm7 {%k1}
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm6, %zmm7, %zmm6
 ; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm7 = [0,8,0,8,0,8,0,8]
 ; AVX512F-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm8
@@ -861,13 +855,13 @@ define void @store_i64_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512F-NEXT:    vinserti64x4 $0, %ymm8, %zmm7, %zmm7
 ; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm8 = [2,10,2,10,2,10,2,10]
 ; AVX512F-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm9
-; AVX512F-NEXT:    vpermt2q %zmm3, %zmm8, %zmm9
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm10
+; AVX512F-NEXT:    vpermt2q %zmm3, %zmm8, %zmm10
 ; AVX512F-NEXT:    vpermi2q %zmm2, %zmm0, %zmm8
-; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm8 {%k1}
-; AVX512F-NEXT:    vmovdqa (%rcx), %ymm9
-; AVX512F-NEXT:    vmovdqa (%rdx), %ymm10
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm12 = ymm10[0],ymm9[0],ymm10[2],ymm9[2]
+; AVX512F-NEXT:    vmovdqa64 %zmm10, %zmm8 {%k1}
+; AVX512F-NEXT:    vmovdqa (%rcx), %ymm10
+; AVX512F-NEXT:    vmovdqa (%rdx), %ymm11
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm12 = ymm11[0],ymm10[0],ymm11[2],ymm10[2]
 ; AVX512F-NEXT:    vmovdqa (%rsi), %ymm14
 ; AVX512F-NEXT:    vmovdqa (%rdi), %ymm15
 ; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm4 = ymm15[0],ymm14[0],ymm15[2],ymm14[2]
@@ -878,7 +872,7 @@ define void @store_i64_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512F-NEXT:    vpermt2q %zmm3, %zmm8, %zmm1
 ; AVX512F-NEXT:    vpermt2q %zmm2, %zmm8, %zmm0
 ; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1}
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm10[1],ymm9[1],ymm10[3],ymm9[3]
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm11[1],ymm10[1],ymm11[3],ymm10[3]
 ; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm15[1],ymm14[1],ymm15[3],ymm14[3]
 ; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
 ; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm0
@@ -887,9 +881,9 @@ define void @store_i64_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512F-NEXT:    vmovdqa64 %zmm7, (%rax)
 ; AVX512F-NEXT:    vmovdqa64 %zmm6, 320(%rax)
 ; AVX512F-NEXT:    vmovdqa64 %zmm13, 256(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm11, 448(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm9, 448(%rax)
 ; AVX512F-NEXT:    vmovdqa64 %zmm5, 384(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm17, 64(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm16, 64(%rax)
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
@@ -899,9 +893,9 @@ define void @store_i64_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %r10
 ; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %r11
 ; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm6
-; AVX512BW-NEXT:    vmovdqa64 (%rsi), %zmm9
+; AVX512BW-NEXT:    vmovdqa64 (%rsi), %zmm8
 ; AVX512BW-NEXT:    vmovdqa64 (%rdx), %zmm7
-; AVX512BW-NEXT:    vmovdqa64 (%rcx), %zmm8
+; AVX512BW-NEXT:    vmovdqa64 (%rcx), %zmm11
 ; AVX512BW-NEXT:    vmovdqa64 (%r8), %zmm0
 ; AVX512BW-NEXT:    vmovdqa64 (%r9), %zmm2
 ; AVX512BW-NEXT:    vmovdqa64 (%r11), %zmm1
@@ -919,53 +913,47 @@ define void @store_i64_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm5
 ; AVX512BW-NEXT:    vinserti128 $1, (%rdx), %ymm5, %ymm12
 ; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm5 = ymm12[1],ymm10[1],ymm12[3],ymm10[3]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm5, %zmm4, %zmm17
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm5, %zmm4, %zmm16
 ; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm5 = [6,14,6,14,6,14,6,14]
 ; AVX512BW-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm11
-; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm5, %zmm11
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm11 {%k1} = zmm1[0],zmm3[0],zmm1[2],zmm3[2],zmm1[4],zmm3[4],zmm1[6],zmm3[6]
-; AVX512BW-NEXT:    vpermi2q %zmm9, %zmm6, %zmm5
-; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm13 = [6,14,6,14]
-; AVX512BW-NEXT:    # ymm13 = mem[0,1,0,1]
-; AVX512BW-NEXT:    vpermi2q %zmm8, %zmm7, %zmm13
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm13[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm5, %zmm11, %zmm5
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm11 = [7,15,7,15,7,15,7,15]
-; AVX512BW-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm13
-; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm11, %zmm13
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm13 {%k1} = zmm1[1],zmm3[1],zmm1[3],zmm3[3],zmm1[5],zmm3[5],zmm1[7],zmm3[7]
-; AVX512BW-NEXT:    vpermi2q %zmm9, %zmm6, %zmm11
-; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm14 = [7,15,7,15]
-; AVX512BW-NEXT:    # ymm14 = mem[0,1,0,1]
-; AVX512BW-NEXT:    vpermi2q %zmm8, %zmm7, %zmm14
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm14[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm11, %zmm13, %zmm11
+; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm9
+; AVX512BW-NEXT:    vpermt2q %zmm11, %zmm5, %zmm9
+; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm13
+; AVX512BW-NEXT:    vpermt2q %zmm8, %zmm5, %zmm13
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3],ymm9[4,5,6,7]
+; AVX512BW-NEXT:    vpermi2q %zmm2, %zmm0, %zmm5
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm1[0],zmm3[0],zmm1[2],zmm3[2],zmm1[4],zmm3[4],zmm1[6],zmm3[6]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm9, %zmm5, %zmm5
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm9 = [7,15,7,15,7,15,7,15]
+; AVX512BW-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm13
+; AVX512BW-NEXT:    vpermt2q %zmm11, %zmm9, %zmm13
+; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm14
+; AVX512BW-NEXT:    vpermt2q %zmm8, %zmm9, %zmm14
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7]
+; AVX512BW-NEXT:    vpermi2q %zmm2, %zmm0, %zmm9
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm9 {%k1} = zmm1[1],zmm3[1],zmm1[3],zmm3[3],zmm1[5],zmm3[5],zmm1[7],zmm3[7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm13, %zmm9, %zmm9
 ; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm13 = [4,12,4,12,4,12,4,12]
 ; AVX512BW-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm14
-; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm13, %zmm14
+; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm14
+; AVX512BW-NEXT:    vpermt2q %zmm11, %zmm13, %zmm14
+; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm15
+; AVX512BW-NEXT:    vpermt2q %zmm8, %zmm13, %zmm15
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3],ymm14[4,5,6,7]
+; AVX512BW-NEXT:    vpermi2q %zmm3, %zmm1, %zmm13
 ; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm15 = zmm0[0],zmm2[0],zmm0[2],zmm2[2],zmm0[4],zmm2[4],zmm0[6],zmm2[6]
-; AVX512BW-NEXT:    vmovdqa64 %zmm14, %zmm15 {%k1}
-; AVX512BW-NEXT:    vpermi2q %zmm9, %zmm6, %zmm13
-; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm14 = [4,12,4,12]
-; AVX512BW-NEXT:    # ymm14 = mem[0,1,0,1]
-; AVX512BW-NEXT:    vpermi2q %zmm8, %zmm7, %zmm14
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm13, %zmm15, %zmm13
+; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm15 {%k1}
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm14, %zmm15, %zmm13
 ; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm14 = [5,13,5,13,5,13,5,13]
 ; AVX512BW-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm15
-; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm14, %zmm15
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm16 = zmm0[1],zmm2[1],zmm0[3],zmm2[3],zmm0[5],zmm2[5],zmm0[7],zmm2[7]
-; AVX512BW-NEXT:    vmovdqa64 %zmm15, %zmm16 {%k1}
-; AVX512BW-NEXT:    vpermt2q %zmm9, %zmm14, %zmm6
-; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm9 = [5,13,5,13]
-; AVX512BW-NEXT:    # ymm9 = mem[0,1,0,1]
-; AVX512BW-NEXT:    vpermi2q %zmm8, %zmm7, %zmm9
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm9[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm6, %zmm16, %zmm6
+; AVX512BW-NEXT:    vpermt2q %zmm11, %zmm14, %zmm7
+; AVX512BW-NEXT:    vpermt2q %zmm8, %zmm14, %zmm6
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7]
+; AVX512BW-NEXT:    vpermi2q %zmm3, %zmm1, %zmm14
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm7 = zmm0[1],zmm2[1],zmm0[3],zmm2[3],zmm0[5],zmm2[5],zmm0[7],zmm2[7]
+; AVX512BW-NEXT:    vmovdqa64 %zmm14, %zmm7 {%k1}
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm6, %zmm7, %zmm6
 ; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm7 = [0,8,0,8,0,8,0,8]
 ; AVX512BW-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm8
@@ -976,13 +964,13 @@ define void @store_i64_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512BW-NEXT:    vinserti64x4 $0, %ymm8, %zmm7, %zmm7
 ; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm8 = [2,10,2,10,2,10,2,10]
 ; AVX512BW-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm9
-; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm8, %zmm9
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm10
+; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm8, %zmm10
 ; AVX512BW-NEXT:    vpermi2q %zmm2, %zmm0, %zmm8
-; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm8 {%k1}
-; AVX512BW-NEXT:    vmovdqa (%rcx), %ymm9
-; AVX512BW-NEXT:    vmovdqa (%rdx), %ymm10
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm12 = ymm10[0],ymm9[0],ymm10[2],ymm9[2]
+; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm8 {%k1}
+; AVX512BW-NEXT:    vmovdqa (%rcx), %ymm10
+; AVX512BW-NEXT:    vmovdqa (%rdx), %ymm11
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm12 = ymm11[0],ymm10[0],ymm11[2],ymm10[2]
 ; AVX512BW-NEXT:    vmovdqa (%rsi), %ymm14
 ; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm15
 ; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm4 = ymm15[0],ymm14[0],ymm15[2],ymm14[2]
@@ -993,7 +981,7 @@ define void @store_i64_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm8, %zmm1
 ; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm8, %zmm0
 ; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1}
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm10[1],ymm9[1],ymm10[3],ymm9[3]
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm11[1],ymm10[1],ymm11[3],ymm10[3]
 ; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm15[1],ymm14[1],ymm15[3],ymm14[3]
 ; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
 ; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm0
@@ -1002,9 +990,9 @@ define void @store_i64_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512BW-NEXT:    vmovdqa64 %zmm7, (%rax)
 ; AVX512BW-NEXT:    vmovdqa64 %zmm6, 320(%rax)
 ; AVX512BW-NEXT:    vmovdqa64 %zmm13, 256(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm11, 448(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm9, 448(%rax)
 ; AVX512BW-NEXT:    vmovdqa64 %zmm5, 384(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm17, 64(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm16, 64(%rax)
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
   %in.vec0 = load <8 x i64>, ptr %in.vecptr0, align 64
@@ -1894,204 +1882,196 @@ define void @store_i64_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %r10
 ; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %r11
 ; AVX512F-NEXT:    vmovdqa64 64(%rdi), %zmm11
-; AVX512F-NEXT:    vmovdqa64 (%rdi), %zmm3
-; AVX512F-NEXT:    vmovdqa64 64(%rsi), %zmm25
-; AVX512F-NEXT:    vmovdqa64 (%rsi), %zmm19
-; AVX512F-NEXT:    vmovdqa64 64(%rdx), %zmm12
-; AVX512F-NEXT:    vmovdqa64 (%rdx), %zmm0
-; AVX512F-NEXT:    vmovdqa64 64(%rcx), %zmm26
-; AVX512F-NEXT:    vmovdqa64 (%rcx), %zmm17
+; AVX512F-NEXT:    vmovdqa64 (%rdi), %zmm15
+; AVX512F-NEXT:    vmovdqa64 64(%rsi), %zmm27
+; AVX512F-NEXT:    vmovdqa64 (%rsi), %zmm17
+; AVX512F-NEXT:    vmovdqa64 64(%rdx), %zmm14
+; AVX512F-NEXT:    vmovdqa64 (%rdx), %zmm5
+; AVX512F-NEXT:    vmovdqa64 64(%rcx), %zmm28
+; AVX512F-NEXT:    vmovdqa64 (%rcx), %zmm26
 ; AVX512F-NEXT:    vmovdqa64 (%r8), %zmm6
-; AVX512F-NEXT:    vmovdqa64 64(%r8), %zmm16
-; AVX512F-NEXT:    vmovdqa64 (%r9), %zmm31
-; AVX512F-NEXT:    vmovdqa64 64(%r9), %zmm27
+; AVX512F-NEXT:    vmovdqa64 64(%r8), %zmm0
+; AVX512F-NEXT:    vmovdqa64 (%r9), %zmm7
+; AVX512F-NEXT:    vmovdqa64 64(%r9), %zmm1
 ; AVX512F-NEXT:    vmovdqa64 (%r11), %zmm8
-; AVX512F-NEXT:    vmovdqa64 64(%r11), %zmm30
+; AVX512F-NEXT:    vmovdqa64 64(%r11), %zmm3
 ; AVX512F-NEXT:    vmovdqa64 (%r10), %zmm9
-; AVX512F-NEXT:    vmovdqa64 64(%r10), %zmm29
+; AVX512F-NEXT:    vmovdqa64 64(%r10), %zmm4
 ; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm18 = [1,9,1,9,1,9,1,9]
 ; AVX512F-NEXT:    # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm1
-; AVX512F-NEXT:    vpermt2q %zmm9, %zmm18, %zmm1
-; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm4
-; AVX512F-NEXT:    vpermt2q %zmm31, %zmm18, %zmm4
+; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm10
+; AVX512F-NEXT:    vpermt2q %zmm9, %zmm18, %zmm10
+; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm12
+; AVX512F-NEXT:    vpermt2q %zmm7, %zmm18, %zmm12
 ; AVX512F-NEXT:    movb $-64, %r8b
 ; AVX512F-NEXT:    kmovw %r8d, %k1
-; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
-; AVX512F-NEXT:    vmovdqa (%rsi), %xmm1
-; AVX512F-NEXT:    vinserti128 $1, (%rcx), %ymm1, %ymm1
-; AVX512F-NEXT:    vmovdqa (%rdi), %xmm10
-; AVX512F-NEXT:    vinserti128 $1, (%rdx), %ymm10, %ymm10
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm13 = ymm10[1],ymm1[1],ymm10[3],ymm1[3]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm13, %zmm4, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm23 = [0,8,0,8,0,8,0,8]
+; AVX512F-NEXT:    vmovdqa64 %zmm10, %zmm12 {%k1}
+; AVX512F-NEXT:    vmovdqa (%rsi), %xmm10
+; AVX512F-NEXT:    vinserti128 $1, (%rcx), %ymm10, %ymm10
+; AVX512F-NEXT:    vmovdqa64 (%rdi), %xmm16
+; AVX512F-NEXT:    vinserti32x4 $1, (%rdx), %ymm16, %ymm16
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm19 = ymm16[1],ymm10[1],ymm16[3],ymm10[3]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm19, %zmm12, %zmm21
+; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm19 = [0,8,0,8,0,8,0,8]
+; AVX512F-NEXT:    # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm12
+; AVX512F-NEXT:    vpermt2q %zmm9, %zmm19, %zmm12
+; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm20
+; AVX512F-NEXT:    vpermt2q %zmm7, %zmm19, %zmm20
+; AVX512F-NEXT:    vmovdqa64 %zmm12, %zmm20 {%k1}
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm10 = ymm16[0],ymm10[0],ymm16[2],ymm10[2]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm10, %zmm20, %zmm22
+; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm20 = [5,13,5,13,5,13,5,13]
+; AVX512F-NEXT:    # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm10
+; AVX512F-NEXT:    vpermt2q %zmm26, %zmm20, %zmm10
+; AVX512F-NEXT:    vmovdqa64 %zmm15, %zmm12
+; AVX512F-NEXT:    vpermt2q %zmm17, %zmm20, %zmm12
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7]
+; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm12
+; AVX512F-NEXT:    vpermt2q %zmm9, %zmm20, %zmm12
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm16 = zmm6[1],zmm7[1],zmm6[3],zmm7[3],zmm6[5],zmm7[5],zmm6[7],zmm7[7]
+; AVX512F-NEXT:    vmovdqa64 %zmm12, %zmm16 {%k1}
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm10, %zmm16, %zmm29
+; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm23 = [4,12,4,12,4,12,4,12]
 ; AVX512F-NEXT:    # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm4
-; AVX512F-NEXT:    vpermt2q %zmm9, %zmm23, %zmm4
+; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm12
+; AVX512F-NEXT:    vpermt2q %zmm26, %zmm23, %zmm12
+; AVX512F-NEXT:    vmovdqa64 %zmm15, %zmm2
+; AVX512F-NEXT:    vpermt2q %zmm17, %zmm23, %zmm2
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm12[4,5,6,7]
+; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm12
+; AVX512F-NEXT:    vpermt2q %zmm9, %zmm23, %zmm12
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm16 = zmm6[0],zmm7[0],zmm6[2],zmm7[2],zmm6[4],zmm7[4],zmm6[6],zmm7[6]
+; AVX512F-NEXT:    vmovdqa64 %zmm12, %zmm16 {%k1}
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm16, %zmm31
+; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm24 = [7,15,7,15,7,15,7,15]
+; AVX512F-NEXT:    # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm2
+; AVX512F-NEXT:    vpermt2q %zmm26, %zmm24, %zmm2
+; AVX512F-NEXT:    vmovdqa64 %zmm15, %zmm13
+; AVX512F-NEXT:    vpermt2q %zmm17, %zmm24, %zmm13
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm2[4,5,6,7]
 ; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm13
-; AVX512F-NEXT:    vpermt2q %zmm31, %zmm23, %zmm13
-; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm13 {%k1}
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm10[0],ymm1[0],ymm10[2],ymm1[2]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm13, %zmm21
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm10 = [5,13,5,13,5,13,5,13]
-; AVX512F-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm1
-; AVX512F-NEXT:    vpermt2q %zmm9, %zmm10, %zmm1
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm4 = zmm6[1],zmm31[1],zmm6[3],zmm31[3],zmm6[5],zmm31[5],zmm6[7],zmm31[7]
-; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
-; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm1
-; AVX512F-NEXT:    vpermt2q %zmm19, %zmm10, %zmm1
-; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm14 = [5,13,5,13]
-; AVX512F-NEXT:    # ymm14 = mem[0,1,0,1]
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm13
-; AVX512F-NEXT:    vpermt2q %zmm17, %zmm14, %zmm13
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm13[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm22
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm15 = [4,12,4,12,4,12,4,12]
-; AVX512F-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm1
-; AVX512F-NEXT:    vpermt2q %zmm9, %zmm15, %zmm1
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm4 = zmm6[0],zmm31[0],zmm6[2],zmm31[2],zmm6[4],zmm31[4],zmm6[6],zmm31[6]
-; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
-; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm13
-; AVX512F-NEXT:    vpermt2q %zmm19, %zmm15, %zmm13
-; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [4,12,4,12]
-; AVX512F-NEXT:    # ymm1 = mem[0,1,0,1]
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm5
-; AVX512F-NEXT:    vpermt2q %zmm17, %zmm1, %zmm5
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3],ymm5[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm5, %zmm4, %zmm24
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm13 = [7,15,7,15,7,15,7,15]
-; AVX512F-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vpermt2q %zmm7, %zmm24, %zmm13
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm13 {%k1} = zmm8[1],zmm9[1],zmm8[3],zmm9[3],zmm8[5],zmm9[5],zmm8[7],zmm9[7]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm13, %zmm16
+; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm25 = [6,14,6,14,6,14,6,14]
+; AVX512F-NEXT:    # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vpermt2q %zmm26, %zmm25, %zmm5
+; AVX512F-NEXT:    vpermt2q %zmm17, %zmm25, %zmm15
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3],ymm5[4,5,6,7]
 ; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm5
-; AVX512F-NEXT:    vpermt2q %zmm31, %zmm13, %zmm5
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm8[1],zmm9[1],zmm8[3],zmm9[3],zmm8[5],zmm9[5],zmm8[7],zmm9[7]
+; AVX512F-NEXT:    vpermt2q %zmm7, %zmm25, %zmm5
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm8[0],zmm9[0],zmm8[2],zmm9[2],zmm8[4],zmm9[4],zmm8[6],zmm9[6]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm5, %zmm17
 ; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm2
-; AVX512F-NEXT:    vpermt2q %zmm19, %zmm13, %zmm2
-; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [7,15,7,15]
-; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm7
-; AVX512F-NEXT:    vpermt2q %zmm17, %zmm4, %zmm7
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm7[4,5,6,7]
+; AVX512F-NEXT:    vpermt2q %zmm4, %zmm18, %zmm2
+; AVX512F-NEXT:    vpermi2q %zmm1, %zmm0, %zmm18
+; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm18 {%k1}
+; AVX512F-NEXT:    vmovdqa 64(%rsi), %xmm2
+; AVX512F-NEXT:    vinserti128 $1, 64(%rcx), %ymm2, %ymm2
+; AVX512F-NEXT:    vmovdqa 64(%rdi), %xmm5
+; AVX512F-NEXT:    vinserti128 $1, 64(%rdx), %ymm5, %ymm5
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm13 = ymm5[1],ymm2[1],ymm5[3],ymm2[3]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm13, %zmm18, %zmm18
+; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm13
+; AVX512F-NEXT:    vpermt2q %zmm4, %zmm19, %zmm13
+; AVX512F-NEXT:    vpermi2q %zmm1, %zmm0, %zmm19
+; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm19 {%k1}
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm2 = ymm5[0],ymm2[0],ymm5[2],ymm2[2]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm19, %zmm19
+; AVX512F-NEXT:    vmovdqa64 %zmm14, %zmm2
+; AVX512F-NEXT:    vpermt2q %zmm28, %zmm20, %zmm2
+; AVX512F-NEXT:    vmovdqa64 %zmm11, %zmm5
+; AVX512F-NEXT:    vpermt2q %zmm27, %zmm20, %zmm5
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7]
+; AVX512F-NEXT:    vpermi2q %zmm4, %zmm3, %zmm20
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm5 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
+; AVX512F-NEXT:    vmovdqa64 %zmm20, %zmm5 {%k1}
 ; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm5, %zmm20
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm28 = [6,14,6,14,6,14,6,14]
-; AVX512F-NEXT:    # zmm28 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm2
-; AVX512F-NEXT:    vpermt2q %zmm31, %zmm28, %zmm2
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm8[0],zmm9[0],zmm8[2],zmm9[2],zmm8[4],zmm9[4],zmm8[6],zmm9[6]
-; AVX512F-NEXT:    vpermt2q %zmm19, %zmm28, %zmm3
-; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm7 = [6,14,6,14]
-; AVX512F-NEXT:    # ymm7 = mem[0,1,0,1]
-; AVX512F-NEXT:    vpermt2q %zmm17, %zmm7, %zmm0
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm17
-; AVX512F-NEXT:    vmovdqa64 %zmm30, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm29, %zmm18, %zmm0
-; AVX512F-NEXT:    vpermi2q %zmm27, %zmm16, %zmm18
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm18 {%k1}
-; AVX512F-NEXT:    vmovdqa 64(%rsi), %xmm0
-; AVX512F-NEXT:    vinserti128 $1, 64(%rcx), %ymm0, %ymm0
-; AVX512F-NEXT:    vmovdqa 64(%rdi), %xmm2
-; AVX512F-NEXT:    vinserti128 $1, 64(%rdx), %ymm2, %ymm2
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm2[1],ymm0[1],ymm2[3],ymm0[3]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm3, %zmm18, %zmm18
-; AVX512F-NEXT:    vmovdqa64 %zmm30, %zmm3
-; AVX512F-NEXT:    vpermt2q %zmm29, %zmm23, %zmm3
-; AVX512F-NEXT:    vpermi2q %zmm27, %zmm16, %zmm23
-; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm23 {%k1}
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[2],ymm0[2]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm23, %zmm19
-; AVX512F-NEXT:    vmovdqa64 %zmm30, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm29, %zmm10, %zmm0
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm2 = zmm16[1],zmm27[1],zmm16[3],zmm27[3],zmm16[5],zmm27[5],zmm16[7],zmm27[7]
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k1}
-; AVX512F-NEXT:    vpermi2q %zmm25, %zmm11, %zmm10
-; AVX512F-NEXT:    vpermi2q %zmm26, %zmm12, %zmm14
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm14[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm2
-; AVX512F-NEXT:    vmovdqa64 %zmm30, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm29, %zmm15, %zmm0
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm16[0],zmm27[0],zmm16[2],zmm27[2],zmm16[4],zmm27[4],zmm16[6],zmm27[6]
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm3 {%k1}
-; AVX512F-NEXT:    vpermi2q %zmm25, %zmm11, %zmm15
-; AVX512F-NEXT:    vpermi2q %zmm26, %zmm12, %zmm1
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm1[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm3, %zmm1
-; AVX512F-NEXT:    vmovdqa64 %zmm16, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm27, %zmm13, %zmm0
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm30[1],zmm29[1],zmm30[3],zmm29[3],zmm30[5],zmm29[5],zmm30[7],zmm29[7]
-; AVX512F-NEXT:    vpermi2q %zmm25, %zmm11, %zmm13
-; AVX512F-NEXT:    vpermi2q %zmm26, %zmm12, %zmm4
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3],ymm4[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm3, %zmm0, %zmm4
-; AVX512F-NEXT:    vmovdqa64 %zmm16, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm27, %zmm28, %zmm0
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm30[0],zmm29[0],zmm30[2],zmm29[2],zmm30[4],zmm29[4],zmm30[6],zmm29[6]
-; AVX512F-NEXT:    vpermt2q %zmm25, %zmm28, %zmm11
-; AVX512F-NEXT:    vpermt2q %zmm26, %zmm7, %zmm12
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm3 = ymm11[0,1,2,3],ymm12[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm3, %zmm0, %zmm0
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [3,11,3,11,3,11,3,11]
-; AVX512F-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm7
-; AVX512F-NEXT:    vpermt2q %zmm9, %zmm3, %zmm7
-; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm10
-; AVX512F-NEXT:    vpermt2q %zmm31, %zmm3, %zmm10
-; AVX512F-NEXT:    vmovdqa64 %zmm7, %zmm10 {%k1}
-; AVX512F-NEXT:    vmovdqa (%rcx), %ymm7
-; AVX512F-NEXT:    vmovdqa 64(%rcx), %ymm11
-; AVX512F-NEXT:    vmovdqa (%rdx), %ymm12
-; AVX512F-NEXT:    vmovdqa 64(%rdx), %ymm13
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm14 = ymm12[1],ymm7[1],ymm12[3],ymm7[3]
-; AVX512F-NEXT:    vmovdqa (%rsi), %ymm15
-; AVX512F-NEXT:    vmovdqa64 64(%rsi), %ymm23
-; AVX512F-NEXT:    vmovdqa64 (%rdi), %ymm25
-; AVX512F-NEXT:    vmovdqa64 64(%rdi), %ymm26
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm5 = ymm25[1],ymm15[1],ymm25[3],ymm15[3]
-; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm5 = ymm5[2,3],ymm14[2,3]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm5, %zmm10, %zmm5
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm10 = [2,10,2,10,2,10,2,10]
-; AVX512F-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vpermt2q %zmm9, %zmm10, %zmm8
-; AVX512F-NEXT:    vpermt2q %zmm31, %zmm10, %zmm6
+; AVX512F-NEXT:    vmovdqa64 %zmm14, %zmm2
+; AVX512F-NEXT:    vpermt2q %zmm28, %zmm23, %zmm2
+; AVX512F-NEXT:    vmovdqa64 %zmm11, %zmm5
+; AVX512F-NEXT:    vpermt2q %zmm27, %zmm23, %zmm5
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7]
+; AVX512F-NEXT:    vpermi2q %zmm4, %zmm3, %zmm23
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm5 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
+; AVX512F-NEXT:    vmovdqa64 %zmm23, %zmm5 {%k1}
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm5, %zmm2
+; AVX512F-NEXT:    vmovdqa64 %zmm14, %zmm5
+; AVX512F-NEXT:    vpermt2q %zmm28, %zmm24, %zmm5
+; AVX512F-NEXT:    vmovdqa64 %zmm11, %zmm13
+; AVX512F-NEXT:    vpermt2q %zmm27, %zmm24, %zmm13
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3],ymm5[4,5,6,7]
+; AVX512F-NEXT:    vpermi2q %zmm1, %zmm0, %zmm24
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm24 {%k1} = zmm3[1],zmm4[1],zmm3[3],zmm4[3],zmm3[5],zmm4[5],zmm3[7],zmm4[7]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm5, %zmm24, %zmm5
+; AVX512F-NEXT:    vpermt2q %zmm28, %zmm25, %zmm14
+; AVX512F-NEXT:    vpermt2q %zmm27, %zmm25, %zmm11
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm14[4,5,6,7]
+; AVX512F-NEXT:    vpermi2q %zmm1, %zmm0, %zmm25
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm25 {%k1} = zmm3[0],zmm4[0],zmm3[2],zmm4[2],zmm3[4],zmm4[4],zmm3[6],zmm4[6]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm11, %zmm25, %zmm11
+; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm13 = [3,11,3,11,3,11,3,11]
+; AVX512F-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm14
+; AVX512F-NEXT:    vpermt2q %zmm9, %zmm13, %zmm14
+; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm15
+; AVX512F-NEXT:    vpermt2q %zmm7, %zmm13, %zmm15
+; AVX512F-NEXT:    vmovdqa64 %zmm14, %zmm15 {%k1}
+; AVX512F-NEXT:    vmovdqa (%rcx), %ymm14
+; AVX512F-NEXT:    vmovdqa64 64(%rcx), %ymm23
+; AVX512F-NEXT:    vmovdqa64 (%rdx), %ymm24
+; AVX512F-NEXT:    vmovdqa64 64(%rdx), %ymm25
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm10 = ymm24[1],ymm14[1],ymm24[3],ymm14[3]
+; AVX512F-NEXT:    vmovdqa64 (%rsi), %ymm26
+; AVX512F-NEXT:    vmovdqa64 64(%rsi), %ymm27
+; AVX512F-NEXT:    vmovdqa64 (%rdi), %ymm28
+; AVX512F-NEXT:    vmovdqa64 64(%rdi), %ymm30
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm12 = ymm28[1],ymm26[1],ymm28[3],ymm26[3]
+; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm10 = ymm12[2,3],ymm10[2,3]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm10, %zmm15, %zmm10
+; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm12 = [2,10,2,10,2,10,2,10]
+; AVX512F-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vpermt2q %zmm9, %zmm12, %zmm8
+; AVX512F-NEXT:    vpermt2q %zmm7, %zmm12, %zmm6
 ; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm6 {%k1}
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm7 = ymm12[0],ymm7[0],ymm12[2],ymm7[2]
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm8 = ymm25[0],ymm15[0],ymm25[2],ymm15[2]
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm7 = ymm24[0],ymm14[0],ymm24[2],ymm14[2]
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm8 = ymm28[0],ymm26[0],ymm28[2],ymm26[2]
 ; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm7 = ymm8[2,3],ymm7[2,3]
 ; AVX512F-NEXT:    vinserti64x4 $0, %ymm7, %zmm6, %zmm6
-; AVX512F-NEXT:    vmovdqa64 %zmm30, %zmm7
-; AVX512F-NEXT:    vpermt2q %zmm29, %zmm3, %zmm7
-; AVX512F-NEXT:    vpermi2q %zmm27, %zmm16, %zmm3
-; AVX512F-NEXT:    vmovdqa64 %zmm7, %zmm3 {%k1}
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm7 = ymm13[1],ymm11[1],ymm13[3],ymm11[3]
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm8 = ymm26[1],ymm23[1],ymm26[3],ymm23[3]
-; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm7 = ymm8[2,3],ymm7[2,3]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm7, %zmm3, %zmm3
-; AVX512F-NEXT:    vpermt2q %zmm29, %zmm10, %zmm30
-; AVX512F-NEXT:    vpermt2q %zmm27, %zmm10, %zmm16
-; AVX512F-NEXT:    vmovdqa64 %zmm30, %zmm16 {%k1}
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm7 = ymm13[0],ymm11[0],ymm13[2],ymm11[2]
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm8 = ymm26[0],ymm23[0],ymm26[2],ymm23[2]
+; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm7
+; AVX512F-NEXT:    vpermt2q %zmm4, %zmm13, %zmm7
+; AVX512F-NEXT:    vpermi2q %zmm1, %zmm0, %zmm13
+; AVX512F-NEXT:    vmovdqa64 %zmm7, %zmm13 {%k1}
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm7 = ymm25[1],ymm23[1],ymm25[3],ymm23[3]
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm8 = ymm30[1],ymm27[1],ymm30[3],ymm27[3]
 ; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm7 = ymm8[2,3],ymm7[2,3]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm7, %zmm16, %zmm7
-; AVX512F-NEXT:    vmovdqa64 %zmm7, 640(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm3, 704(%rax)
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm7, %zmm13, %zmm7
+; AVX512F-NEXT:    vpermt2q %zmm4, %zmm12, %zmm3
+; AVX512F-NEXT:    vpermt2q %zmm1, %zmm12, %zmm0
+; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm0 {%k1}
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm25[0],ymm23[0],ymm25[2],ymm23[2]
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm30[0],ymm27[0],ymm30[2],ymm27[2]
+; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm0
+; AVX512F-NEXT:    vmovdqa64 %zmm0, 640(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm7, 704(%rax)
 ; AVX512F-NEXT:    vmovdqa64 %zmm6, 128(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm5, 192(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm0, 896(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm4, 960(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm1, 768(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm2, 832(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm10, 192(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm11, 896(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm5, 960(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm2, 768(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm20, 832(%rax)
 ; AVX512F-NEXT:    vmovdqa64 %zmm19, 512(%rax)
 ; AVX512F-NEXT:    vmovdqa64 %zmm18, 576(%rax)
 ; AVX512F-NEXT:    vmovdqa64 %zmm17, 384(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm20, 448(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm24, 256(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm22, 320(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm21, (%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 64(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm16, 448(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm31, 256(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm29, 320(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm22, (%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm21, 64(%rax)
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
@@ -2101,204 +2081,196 @@ define void @store_i64_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %r10
 ; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %r11
 ; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %zmm11
-; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm3
-; AVX512BW-NEXT:    vmovdqa64 64(%rsi), %zmm25
-; AVX512BW-NEXT:    vmovdqa64 (%rsi), %zmm19
-; AVX512BW-NEXT:    vmovdqa64 64(%rdx), %zmm12
-; AVX512BW-NEXT:    vmovdqa64 (%rdx), %zmm0
-; AVX512BW-NEXT:    vmovdqa64 64(%rcx), %zmm26
-; AVX512BW-NEXT:    vmovdqa64 (%rcx), %zmm17
+; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm15
+; AVX512BW-NEXT:    vmovdqa64 64(%rsi), %zmm27
+; AVX512BW-NEXT:    vmovdqa64 (%rsi), %zmm17
+; AVX512BW-NEXT:    vmovdqa64 64(%rdx), %zmm14
+; AVX512BW-NEXT:    vmovdqa64 (%rdx), %zmm5
+; AVX512BW-NEXT:    vmovdqa64 64(%rcx), %zmm28
+; AVX512BW-NEXT:    vmovdqa64 (%rcx), %zmm26
 ; AVX512BW-NEXT:    vmovdqa64 (%r8), %zmm6
-; AVX512BW-NEXT:    vmovdqa64 64(%r8), %zmm16
-; AVX512BW-NEXT:    vmovdqa64 (%r9), %zmm31
-; AVX512BW-NEXT:    vmovdqa64 64(%r9), %zmm27
+; AVX512BW-NEXT:    vmovdqa64 64(%r8), %zmm0
+; AVX512BW-NEXT:    vmovdqa64 (%r9), %zmm7
+; AVX512BW-NEXT:    vmovdqa64 64(%r9), %zmm1
 ; AVX512BW-NEXT:    vmovdqa64 (%r11), %zmm8
-; AVX512BW-NEXT:    vmovdqa64 64(%r11), %zmm30
+; AVX512BW-NEXT:    vmovdqa64 64(%r11), %zmm3
 ; AVX512BW-NEXT:    vmovdqa64 (%r10), %zmm9
-; AVX512BW-NEXT:    vmovdqa64 64(%r10), %zmm29
+; AVX512BW-NEXT:    vmovdqa64 64(%r10), %zmm4
 ; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm18 = [1,9,1,9,1,9,1,9]
 ; AVX512BW-NEXT:    # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm1
-; AVX512BW-NEXT:    vpermt2q %zmm9, %zmm18, %zmm1
-; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm4
-; AVX512BW-NEXT:    vpermt2q %zmm31, %zmm18, %zmm4
+; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm10
+; AVX512BW-NEXT:    vpermt2q %zmm9, %zmm18, %zmm10
+; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm12
+; AVX512BW-NEXT:    vpermt2q %zmm7, %zmm18, %zmm12
 ; AVX512BW-NEXT:    movb $-64, %r8b
 ; AVX512BW-NEXT:    kmovd %r8d, %k1
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
-; AVX512BW-NEXT:    vmovdqa (%rsi), %xmm1
-; AVX512BW-NEXT:    vinserti128 $1, (%rcx), %ymm1, %ymm1
-; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm10
-; AVX512BW-NEXT:    vinserti128 $1, (%rdx), %ymm10, %ymm10
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm13 = ymm10[1],ymm1[1],ymm10[3],ymm1[3]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm13, %zmm4, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm23 = [0,8,0,8,0,8,0,8]
+; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm12 {%k1}
+; AVX512BW-NEXT:    vmovdqa (%rsi), %xmm10
+; AVX512BW-NEXT:    vinserti128 $1, (%rcx), %ymm10, %ymm10
+; AVX512BW-NEXT:    vmovdqa64 (%rdi), %xmm16
+; AVX512BW-NEXT:    vinserti32x4 $1, (%rdx), %ymm16, %ymm16
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm19 = ymm16[1],ymm10[1],ymm16[3],ymm10[3]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm19, %zmm12, %zmm21
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm19 = [0,8,0,8,0,8,0,8]
+; AVX512BW-NEXT:    # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm12
+; AVX512BW-NEXT:    vpermt2q %zmm9, %zmm19, %zmm12
+; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm20
+; AVX512BW-NEXT:    vpermt2q %zmm7, %zmm19, %zmm20
+; AVX512BW-NEXT:    vmovdqa64 %zmm12, %zmm20 {%k1}
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm10 = ymm16[0],ymm10[0],ymm16[2],ymm10[2]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm10, %zmm20, %zmm22
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm20 = [5,13,5,13,5,13,5,13]
+; AVX512BW-NEXT:    # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm10
+; AVX512BW-NEXT:    vpermt2q %zmm26, %zmm20, %zmm10
+; AVX512BW-NEXT:    vmovdqa64 %zmm15, %zmm12
+; AVX512BW-NEXT:    vpermt2q %zmm17, %zmm20, %zmm12
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm12
+; AVX512BW-NEXT:    vpermt2q %zmm9, %zmm20, %zmm12
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm16 = zmm6[1],zmm7[1],zmm6[3],zmm7[3],zmm6[5],zmm7[5],zmm6[7],zmm7[7]
+; AVX512BW-NEXT:    vmovdqa64 %zmm12, %zmm16 {%k1}
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm10, %zmm16, %zmm29
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm23 = [4,12,4,12,4,12,4,12]
 ; AVX512BW-NEXT:    # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm4
-; AVX512BW-NEXT:    vpermt2q %zmm9, %zmm23, %zmm4
+; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm12
+; AVX512BW-NEXT:    vpermt2q %zmm26, %zmm23, %zmm12
+; AVX512BW-NEXT:    vmovdqa64 %zmm15, %zmm2
+; AVX512BW-NEXT:    vpermt2q %zmm17, %zmm23, %zmm2
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm12[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm12
+; AVX512BW-NEXT:    vpermt2q %zmm9, %zmm23, %zmm12
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm16 = zmm6[0],zmm7[0],zmm6[2],zmm7[2],zmm6[4],zmm7[4],zmm6[6],zmm7[6]
+; AVX512BW-NEXT:    vmovdqa64 %zmm12, %zmm16 {%k1}
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm16, %zmm31
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm24 = [7,15,7,15,7,15,7,15]
+; AVX512BW-NEXT:    # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm2
+; AVX512BW-NEXT:    vpermt2q %zmm26, %zmm24, %zmm2
+; AVX512BW-NEXT:    vmovdqa64 %zmm15, %zmm13
+; AVX512BW-NEXT:    vpermt2q %zmm17, %zmm24, %zmm13
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm2[4,5,6,7]
 ; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm13
-; AVX512BW-NEXT:    vpermt2q %zmm31, %zmm23, %zmm13
-; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm13 {%k1}
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm10[0],ymm1[0],ymm10[2],ymm1[2]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm13, %zmm21
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm10 = [5,13,5,13,5,13,5,13]
-; AVX512BW-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm1
-; AVX512BW-NEXT:    vpermt2q %zmm9, %zmm10, %zmm1
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm4 = zmm6[1],zmm31[1],zmm6[3],zmm31[3],zmm6[5],zmm31[5],zmm6[7],zmm31[7]
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm1
-; AVX512BW-NEXT:    vpermt2q %zmm19, %zmm10, %zmm1
-; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm14 = [5,13,5,13]
-; AVX512BW-NEXT:    # ymm14 = mem[0,1,0,1]
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm13
-; AVX512BW-NEXT:    vpermt2q %zmm17, %zmm14, %zmm13
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm13[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm22
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm15 = [4,12,4,12,4,12,4,12]
-; AVX512BW-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm1
-; AVX512BW-NEXT:    vpermt2q %zmm9, %zmm15, %zmm1
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm4 = zmm6[0],zmm31[0],zmm6[2],zmm31[2],zmm6[4],zmm31[4],zmm6[6],zmm31[6]
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm13
-; AVX512BW-NEXT:    vpermt2q %zmm19, %zmm15, %zmm13
-; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [4,12,4,12]
-; AVX512BW-NEXT:    # ymm1 = mem[0,1,0,1]
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm5
-; AVX512BW-NEXT:    vpermt2q %zmm17, %zmm1, %zmm5
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3],ymm5[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm5, %zmm4, %zmm24
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm13 = [7,15,7,15,7,15,7,15]
-; AVX512BW-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vpermt2q %zmm7, %zmm24, %zmm13
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm13 {%k1} = zmm8[1],zmm9[1],zmm8[3],zmm9[3],zmm8[5],zmm9[5],zmm8[7],zmm9[7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm13, %zmm16
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm25 = [6,14,6,14,6,14,6,14]
+; AVX512BW-NEXT:    # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vpermt2q %zmm26, %zmm25, %zmm5
+; AVX512BW-NEXT:    vpermt2q %zmm17, %zmm25, %zmm15
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3],ymm5[4,5,6,7]
 ; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm5
-; AVX512BW-NEXT:    vpermt2q %zmm31, %zmm13, %zmm5
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm8[1],zmm9[1],zmm8[3],zmm9[3],zmm8[5],zmm9[5],zmm8[7],zmm9[7]
+; AVX512BW-NEXT:    vpermt2q %zmm7, %zmm25, %zmm5
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm8[0],zmm9[0],zmm8[2],zmm9[2],zmm8[4],zmm9[4],zmm8[6],zmm9[6]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm5, %zmm17
 ; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm2
-; AVX512BW-NEXT:    vpermt2q %zmm19, %zmm13, %zmm2
-; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [7,15,7,15]
-; AVX512BW-NEXT:    # ymm4 = mem[0,1,0,1]
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm7
-; AVX512BW-NEXT:    vpermt2q %zmm17, %zmm4, %zmm7
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm7[4,5,6,7]
+; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm18, %zmm2
+; AVX512BW-NEXT:    vpermi2q %zmm1, %zmm0, %zmm18
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm18 {%k1}
+; AVX512BW-NEXT:    vmovdqa 64(%rsi), %xmm2
+; AVX512BW-NEXT:    vinserti128 $1, 64(%rcx), %ymm2, %ymm2
+; AVX512BW-NEXT:    vmovdqa 64(%rdi), %xmm5
+; AVX512BW-NEXT:    vinserti128 $1, 64(%rdx), %ymm5, %ymm5
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm13 = ymm5[1],ymm2[1],ymm5[3],ymm2[3]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm13, %zmm18, %zmm18
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm13
+; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm19, %zmm13
+; AVX512BW-NEXT:    vpermi2q %zmm1, %zmm0, %zmm19
+; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm19 {%k1}
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm2 = ymm5[0],ymm2[0],ymm5[2],ymm2[2]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm19, %zmm19
+; AVX512BW-NEXT:    vmovdqa64 %zmm14, %zmm2
+; AVX512BW-NEXT:    vpermt2q %zmm28, %zmm20, %zmm2
+; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm5
+; AVX512BW-NEXT:    vpermt2q %zmm27, %zmm20, %zmm5
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7]
+; AVX512BW-NEXT:    vpermi2q %zmm4, %zmm3, %zmm20
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm5 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
+; AVX512BW-NEXT:    vmovdqa64 %zmm20, %zmm5 {%k1}
 ; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm5, %zmm20
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm28 = [6,14,6,14,6,14,6,14]
-; AVX512BW-NEXT:    # zmm28 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm2
-; AVX512BW-NEXT:    vpermt2q %zmm31, %zmm28, %zmm2
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm8[0],zmm9[0],zmm8[2],zmm9[2],zmm8[4],zmm9[4],zmm8[6],zmm9[6]
-; AVX512BW-NEXT:    vpermt2q %zmm19, %zmm28, %zmm3
-; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm7 = [6,14,6,14]
-; AVX512BW-NEXT:    # ymm7 = mem[0,1,0,1]
-; AVX512BW-NEXT:    vpermt2q %zmm17, %zmm7, %zmm0
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm17
-; AVX512BW-NEXT:    vmovdqa64 %zmm30, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm29, %zmm18, %zmm0
-; AVX512BW-NEXT:    vpermi2q %zmm27, %zmm16, %zmm18
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm18 {%k1}
-; AVX512BW-NEXT:    vmovdqa 64(%rsi), %xmm0
-; AVX512BW-NEXT:    vinserti128 $1, 64(%rcx), %ymm0, %ymm0
-; AVX512BW-NEXT:    vmovdqa 64(%rdi), %xmm2
-; AVX512BW-NEXT:    vinserti128 $1, 64(%rdx), %ymm2, %ymm2
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm2[1],ymm0[1],ymm2[3],ymm0[3]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm3, %zmm18, %zmm18
-; AVX512BW-NEXT:    vmovdqa64 %zmm30, %zmm3
-; AVX512BW-NEXT:    vpermt2q %zmm29, %zmm23, %zmm3
-; AVX512BW-NEXT:    vpermi2q %zmm27, %zmm16, %zmm23
-; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm23 {%k1}
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[2],ymm0[2]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm23, %zmm19
-; AVX512BW-NEXT:    vmovdqa64 %zmm30, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm29, %zmm10, %zmm0
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm2 = zmm16[1],zmm27[1],zmm16[3],zmm27[3],zmm16[5],zmm27[5],zmm16[7],zmm27[7]
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k1}
-; AVX512BW-NEXT:    vpermi2q %zmm25, %zmm11, %zmm10
-; AVX512BW-NEXT:    vpermi2q %zmm26, %zmm12, %zmm14
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm14[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 %zmm30, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm29, %zmm15, %zmm0
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm16[0],zmm27[0],zmm16[2],zmm27[2],zmm16[4],zmm27[4],zmm16[6],zmm27[6]
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm3 {%k1}
-; AVX512BW-NEXT:    vpermi2q %zmm25, %zmm11, %zmm15
-; AVX512BW-NEXT:    vpermi2q %zmm26, %zmm12, %zmm1
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm1[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm3, %zmm1
-; AVX512BW-NEXT:    vmovdqa64 %zmm16, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm27, %zmm13, %zmm0
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm30[1],zmm29[1],zmm30[3],zmm29[3],zmm30[5],zmm29[5],zmm30[7],zmm29[7]
-; AVX512BW-NEXT:    vpermi2q %zmm25, %zmm11, %zmm13
-; AVX512BW-NEXT:    vpermi2q %zmm26, %zmm12, %zmm4
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3],ymm4[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm3, %zmm0, %zmm4
-; AVX512BW-NEXT:    vmovdqa64 %zmm16, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm27, %zmm28, %zmm0
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm30[0],zmm29[0],zmm30[2],zmm29[2],zmm30[4],zmm29[4],zmm30[6],zmm29[6]
-; AVX512BW-NEXT:    vpermt2q %zmm25, %zmm28, %zmm11
-; AVX512BW-NEXT:    vpermt2q %zmm26, %zmm7, %zmm12
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm3 = ymm11[0,1,2,3],ymm12[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm3, %zmm0, %zmm0
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [3,11,3,11,3,11,3,11]
-; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm7
-; AVX512BW-NEXT:    vpermt2q %zmm9, %zmm3, %zmm7
-; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm10
-; AVX512BW-NEXT:    vpermt2q %zmm31, %zmm3, %zmm10
-; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm10 {%k1}
-; AVX512BW-NEXT:    vmovdqa (%rcx), %ymm7
-; AVX512BW-NEXT:    vmovdqa 64(%rcx), %ymm11
-; AVX512BW-NEXT:    vmovdqa (%rdx), %ymm12
-; AVX512BW-NEXT:    vmovdqa 64(%rdx), %ymm13
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm14 = ymm12[1],ymm7[1],ymm12[3],ymm7[3]
-; AVX512BW-NEXT:    vmovdqa (%rsi), %ymm15
-; AVX512BW-NEXT:    vmovdqa64 64(%rsi), %ymm23
-; AVX512BW-NEXT:    vmovdqa64 (%rdi), %ymm25
-; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %ymm26
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm5 = ymm25[1],ymm15[1],ymm25[3],ymm15[3]
-; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm5 = ymm5[2,3],ymm14[2,3]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm5, %zmm10, %zmm5
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm10 = [2,10,2,10,2,10,2,10]
-; AVX512BW-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vpermt2q %zmm9, %zmm10, %zmm8
-; AVX512BW-NEXT:    vpermt2q %zmm31, %zmm10, %zmm6
+; AVX512BW-NEXT:    vmovdqa64 %zmm14, %zmm2
+; AVX512BW-NEXT:    vpermt2q %zmm28, %zmm23, %zmm2
+; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm5
+; AVX512BW-NEXT:    vpermt2q %zmm27, %zmm23, %zmm5
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7]
+; AVX512BW-NEXT:    vpermi2q %zmm4, %zmm3, %zmm23
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm5 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
+; AVX512BW-NEXT:    vmovdqa64 %zmm23, %zmm5 {%k1}
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm5, %zmm2
+; AVX512BW-NEXT:    vmovdqa64 %zmm14, %zmm5
+; AVX512BW-NEXT:    vpermt2q %zmm28, %zmm24, %zmm5
+; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm13
+; AVX512BW-NEXT:    vpermt2q %zmm27, %zmm24, %zmm13
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3],ymm5[4,5,6,7]
+; AVX512BW-NEXT:    vpermi2q %zmm1, %zmm0, %zmm24
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm24 {%k1} = zmm3[1],zmm4[1],zmm3[3],zmm4[3],zmm3[5],zmm4[5],zmm3[7],zmm4[7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm5, %zmm24, %zmm5
+; AVX512BW-NEXT:    vpermt2q %zmm28, %zmm25, %zmm14
+; AVX512BW-NEXT:    vpermt2q %zmm27, %zmm25, %zmm11
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm14[4,5,6,7]
+; AVX512BW-NEXT:    vpermi2q %zmm1, %zmm0, %zmm25
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm25 {%k1} = zmm3[0],zmm4[0],zmm3[2],zmm4[2],zmm3[4],zmm4[4],zmm3[6],zmm4[6]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm11, %zmm25, %zmm11
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm13 = [3,11,3,11,3,11,3,11]
+; AVX512BW-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm14
+; AVX512BW-NEXT:    vpermt2q %zmm9, %zmm13, %zmm14
+; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm15
+; AVX512BW-NEXT:    vpermt2q %zmm7, %zmm13, %zmm15
+; AVX512BW-NEXT:    vmovdqa64 %zmm14, %zmm15 {%k1}
+; AVX512BW-NEXT:    vmovdqa (%rcx), %ymm14
+; AVX512BW-NEXT:    vmovdqa64 64(%rcx), %ymm23
+; AVX512BW-NEXT:    vmovdqa64 (%rdx), %ymm24
+; AVX512BW-NEXT:    vmovdqa64 64(%rdx), %ymm25
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm10 = ymm24[1],ymm14[1],ymm24[3],ymm14[3]
+; AVX512BW-NEXT:    vmovdqa64 (%rsi), %ymm26
+; AVX512BW-NEXT:    vmovdqa64 64(%rsi), %ymm27
+; AVX512BW-NEXT:    vmovdqa64 (%rdi), %ymm28
+; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %ymm30
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm12 = ymm28[1],ymm26[1],ymm28[3],ymm26[3]
+; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm10 = ymm12[2,3],ymm10[2,3]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm10, %zmm15, %zmm10
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm12 = [2,10,2,10,2,10,2,10]
+; AVX512BW-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vpermt2q %zmm9, %zmm12, %zmm8
+; AVX512BW-NEXT:    vpermt2q %zmm7, %zmm12, %zmm6
 ; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm6 {%k1}
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm7 = ymm12[0],ymm7[0],ymm12[2],ymm7[2]
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm8 = ymm25[0],ymm15[0],ymm25[2],ymm15[2]
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm7 = ymm24[0],ymm14[0],ymm24[2],ymm14[2]
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm8 = ymm28[0],ymm26[0],ymm28[2],ymm26[2]
 ; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm7 = ymm8[2,3],ymm7[2,3]
 ; AVX512BW-NEXT:    vinserti64x4 $0, %ymm7, %zmm6, %zmm6
-; AVX512BW-NEXT:    vmovdqa64 %zmm30, %zmm7
-; AVX512BW-NEXT:    vpermt2q %zmm29, %zmm3, %zmm7
-; AVX512BW-NEXT:    vpermi2q %zmm27, %zmm16, %zmm3
-; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm3 {%k1}
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm7 = ymm13[1],ymm11[1],ymm13[3],ymm11[3]
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm8 = ymm26[1],ymm23[1],ymm26[3],ymm23[3]
-; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm7 = ymm8[2,3],ymm7[2,3]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm7, %zmm3, %zmm3
-; AVX512BW-NEXT:    vpermt2q %zmm29, %zmm10, %zmm30
-; AVX512BW-NEXT:    vpermt2q %zmm27, %zmm10, %zmm16
-; AVX512BW-NEXT:    vmovdqa64 %zmm30, %zmm16 {%k1}
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm7 = ymm13[0],ymm11[0],ymm13[2],ymm11[2]
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm8 = ymm26[0],ymm23[0],ymm26[2],ymm23[2]
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm7
+; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm13, %zmm7
+; AVX512BW-NEXT:    vpermi2q %zmm1, %zmm0, %zmm13
+; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm13 {%k1}
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm7 = ymm25[1],ymm23[1],ymm25[3],ymm23[3]
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm8 = ymm30[1],ymm27[1],ymm30[3],ymm27[3]
 ; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm7 = ymm8[2,3],ymm7[2,3]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm7, %zmm16, %zmm7
-; AVX512BW-NEXT:    vmovdqa64 %zmm7, 640(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm3, 704(%rax)
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm7, %zmm13, %zmm7
+; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm12, %zmm3
+; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm12, %zmm0
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm0 {%k1}
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm25[0],ymm23[0],ymm25[2],ymm23[2]
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm30[0],ymm27[0],ymm30[2],ymm27[2]
+; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, 640(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm7, 704(%rax)
 ; AVX512BW-NEXT:    vmovdqa64 %zmm6, 128(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm5, 192(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, 896(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm4, 960(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, 768(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, 832(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm10, 192(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm11, 896(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm5, 960(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, 768(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm20, 832(%rax)
 ; AVX512BW-NEXT:    vmovdqa64 %zmm19, 512(%rax)
 ; AVX512BW-NEXT:    vmovdqa64 %zmm18, 576(%rax)
 ; AVX512BW-NEXT:    vmovdqa64 %zmm17, 384(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm20, 448(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm24, 256(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm22, 320(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm21, (%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 64(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm16, 448(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm31, 256(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm29, 320(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm22, (%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm21, 64(%rax)
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
   %in.vec0 = load <16 x i64>, ptr %in.vecptr0, align 64
@@ -4142,461 +4114,451 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ;
 ; AVX512F-LABEL: store_i64_stride8_vf32:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    subq $2632, %rsp # imm = 0xA48
+; AVX512F-NEXT:    subq $2312, %rsp # imm = 0x908
 ; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %r10
 ; AVX512F-NEXT:    vmovaps 128(%rdi), %zmm0
 ; AVX512F-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 64(%rdi), %zmm1
-; AVX512F-NEXT:    vmovdqa64 (%rdi), %zmm5
-; AVX512F-NEXT:    vmovdqa64 128(%rsi), %zmm25
-; AVX512F-NEXT:    vmovdqa64 64(%rsi), %zmm20
-; AVX512F-NEXT:    vmovdqa64 (%rsi), %zmm0
-; AVX512F-NEXT:    vmovaps 192(%rdx), %zmm2
-; AVX512F-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 128(%rdx), %zmm13
-; AVX512F-NEXT:    vmovdqa64 64(%rdx), %zmm2
-; AVX512F-NEXT:    vmovdqa64 (%rdx), %zmm6
-; AVX512F-NEXT:    vmovdqa64 64(%rcx), %zmm21
-; AVX512F-NEXT:    vmovdqa64 (%rcx), %zmm7
-; AVX512F-NEXT:    vmovdqa64 (%r8), %zmm23
-; AVX512F-NEXT:    vmovdqa64 64(%r8), %zmm19
-; AVX512F-NEXT:    vmovdqa64 (%r9), %zmm18
-; AVX512F-NEXT:    vmovdqa64 64(%r9), %zmm28
-; AVX512F-NEXT:    vmovdqa64 (%r10), %zmm17
-; AVX512F-NEXT:    vmovdqa64 64(%r10), %zmm16
-; AVX512F-NEXT:    vmovdqa64 (%rax), %zmm24
-; AVX512F-NEXT:    vmovdqa64 64(%rax), %zmm22
+; AVX512F-NEXT:    vmovdqa64 64(%rdi), %zmm6
+; AVX512F-NEXT:    vmovdqa64 (%rdi), %zmm2
+; AVX512F-NEXT:    vmovdqa64 128(%rsi), %zmm0
+; AVX512F-NEXT:    vmovdqa64 64(%rsi), %zmm5
+; AVX512F-NEXT:    vmovdqa64 (%rsi), %zmm3
+; AVX512F-NEXT:    vmovaps 192(%rdx), %zmm1
+; AVX512F-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovaps 128(%rdx), %zmm1
+; AVX512F-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 64(%rdx), %zmm7
+; AVX512F-NEXT:    vmovdqa64 (%rdx), %zmm4
+; AVX512F-NEXT:    vmovdqa64 192(%rcx), %zmm23
+; AVX512F-NEXT:    vmovdqa64 128(%rcx), %zmm1
+; AVX512F-NEXT:    vmovdqa64 64(%rcx), %zmm9
+; AVX512F-NEXT:    vmovdqa64 (%rcx), %zmm8
+; AVX512F-NEXT:    vmovdqa64 (%r8), %zmm18
+; AVX512F-NEXT:    vmovdqa64 64(%r8), %zmm16
+; AVX512F-NEXT:    vmovdqa64 (%r9), %zmm12
+; AVX512F-NEXT:    vmovdqa64 64(%r9), %zmm19
+; AVX512F-NEXT:    vmovdqa64 (%r10), %zmm25
+; AVX512F-NEXT:    vmovdqa64 64(%r10), %zmm17
+; AVX512F-NEXT:    vmovdqa64 (%rax), %zmm15
+; AVX512F-NEXT:    vmovdqa64 64(%rax), %zmm20
 ; AVX512F-NEXT:    movb $-64, %r11b
 ; AVX512F-NEXT:    kmovw %r11d, %k1
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm15 = [4,12,4,12,4,12,4,12]
-; AVX512F-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm17, %zmm9
-; AVX512F-NEXT:    vpermt2q %zmm24, %zmm15, %zmm9
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm10 = zmm23[0],zmm18[0],zmm23[2],zmm18[2],zmm23[4],zmm18[4],zmm23[6],zmm18[6]
-; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm10 {%k1}
-; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm9
-; AVX512F-NEXT:    vpermt2q %zmm0, %zmm15, %zmm9
-; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm12 = [4,12,4,12]
-; AVX512F-NEXT:    # ymm12 = mem[0,1,0,1]
-; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm11
-; AVX512F-NEXT:    vpermt2q %zmm7, %zmm12, %zmm11
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm11[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm9, %zmm10, %zmm9
-; AVX512F-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm8 = [5,13,5,13,5,13,5,13]
-; AVX512F-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm17, %zmm9
-; AVX512F-NEXT:    vpermt2q %zmm24, %zmm8, %zmm9
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm10 = zmm23[1],zmm18[1],zmm23[3],zmm18[3],zmm23[5],zmm18[5],zmm23[7],zmm18[7]
-; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm10 {%k1}
-; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm9
-; AVX512F-NEXT:    vpermt2q %zmm0, %zmm8, %zmm9
-; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm14 = [5,13,5,13]
-; AVX512F-NEXT:    # ymm14 = mem[0,1,0,1]
-; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm11
-; AVX512F-NEXT:    vpermt2q %zmm7, %zmm14, %zmm11
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm11[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm9, %zmm10, %zmm9
-; AVX512F-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [6,14,6,14,6,14,6,14]
-; AVX512F-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm23, %zmm9
-; AVX512F-NEXT:    vpermt2q %zmm18, %zmm3, %zmm9
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm17[0],zmm24[0],zmm17[2],zmm24[2],zmm17[4],zmm24[4],zmm17[6],zmm24[6]
-; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm10
-; AVX512F-NEXT:    vpermt2q %zmm0, %zmm3, %zmm10
-; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [6,14,6,14]
-; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
-; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm11
-; AVX512F-NEXT:    vpermt2q %zmm7, %zmm4, %zmm11
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm10, %zmm9, %zmm9
-; AVX512F-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm29 = [7,15,7,15,7,15,7,15]
-; AVX512F-NEXT:    # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm23, %zmm9
-; AVX512F-NEXT:    vpermt2q %zmm18, %zmm29, %zmm9
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm9 {%k1} = zmm17[1],zmm24[1],zmm17[3],zmm24[3],zmm17[5],zmm24[5],zmm17[7],zmm24[7]
-; AVX512F-NEXT:    vpermt2q %zmm0, %zmm29, %zmm5
-; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [7,15,7,15]
-; AVX512F-NEXT:    # ymm0 = mem[0,1,0,1]
-; AVX512F-NEXT:    vpermt2q %zmm7, %zmm0, %zmm6
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm5, %zmm9, %zmm5
-; AVX512F-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm16, %zmm5
-; AVX512F-NEXT:    vpermt2q %zmm22, %zmm15, %zmm5
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm6 = zmm19[0],zmm28[0],zmm19[2],zmm28[2],zmm19[4],zmm28[4],zmm19[6],zmm28[6]
-; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm6 {%k1}
-; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm5
-; AVX512F-NEXT:    vpermt2q %zmm20, %zmm15, %zmm5
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm7
-; AVX512F-NEXT:    vpermt2q %zmm21, %zmm12, %zmm7
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm5, %zmm6, %zmm5
-; AVX512F-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm16, %zmm5
-; AVX512F-NEXT:    vpermt2q %zmm22, %zmm8, %zmm5
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm6 = zmm19[1],zmm28[1],zmm19[3],zmm28[3],zmm19[5],zmm28[5],zmm19[7],zmm28[7]
-; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm6 {%k1}
-; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm5
-; AVX512F-NEXT:    vpermt2q %zmm20, %zmm8, %zmm5
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm7
-; AVX512F-NEXT:    vpermt2q %zmm21, %zmm14, %zmm7
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm5, %zmm6, %zmm5
-; AVX512F-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm5
-; AVX512F-NEXT:    vpermt2q %zmm20, %zmm3, %zmm5
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm6
-; AVX512F-NEXT:    vpermt2q %zmm21, %zmm4, %zmm6
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7]
-; AVX512F-NEXT:    vmovdqa64 %zmm19, %zmm6
-; AVX512F-NEXT:    vpermt2q %zmm28, %zmm3, %zmm6
-; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm7
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm16[0],zmm22[0],zmm16[2],zmm22[2],zmm16[4],zmm22[4],zmm16[6],zmm22[6]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm5, %zmm6, %zmm3
+; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm27 = [4,12,4,12,4,12,4,12]
+; AVX512F-NEXT:    # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm10
+; AVX512F-NEXT:    vpermt2q %zmm8, %zmm27, %zmm10
+; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm11
+; AVX512F-NEXT:    vpermt2q %zmm3, %zmm27, %zmm11
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7]
+; AVX512F-NEXT:    vmovdqa64 %zmm25, %zmm11
+; AVX512F-NEXT:    vpermt2q %zmm15, %zmm27, %zmm11
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm13 = zmm18[0],zmm12[0],zmm18[2],zmm12[2],zmm18[4],zmm12[4],zmm18[6],zmm12[6]
+; AVX512F-NEXT:    vmovdqa64 %zmm11, %zmm13 {%k1}
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm10, %zmm13, %zmm10
+; AVX512F-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm24 = [5,13,5,13,5,13,5,13]
+; AVX512F-NEXT:    # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm10
+; AVX512F-NEXT:    vpermt2q %zmm8, %zmm24, %zmm10
+; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm11
+; AVX512F-NEXT:    vpermt2q %zmm3, %zmm24, %zmm11
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7]
+; AVX512F-NEXT:    vmovdqa64 %zmm25, %zmm11
+; AVX512F-NEXT:    vpermt2q %zmm15, %zmm24, %zmm11
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm14 = zmm18[1],zmm12[1],zmm18[3],zmm12[3],zmm18[5],zmm12[5],zmm18[7],zmm12[7]
+; AVX512F-NEXT:    vmovdqa64 %zmm11, %zmm14 {%k1}
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm10, %zmm14, %zmm10
+; AVX512F-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm22 = [6,14,6,14,6,14,6,14]
+; AVX512F-NEXT:    # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm10
+; AVX512F-NEXT:    vpermt2q %zmm8, %zmm22, %zmm10
+; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm11
+; AVX512F-NEXT:    vpermt2q %zmm3, %zmm22, %zmm11
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7]
+; AVX512F-NEXT:    vmovdqa64 %zmm18, %zmm11
+; AVX512F-NEXT:    vpermt2q %zmm12, %zmm22, %zmm11
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm11 {%k1} = zmm25[0],zmm15[0],zmm25[2],zmm15[2],zmm25[4],zmm15[4],zmm25[6],zmm15[6]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm10, %zmm11, %zmm10
+; AVX512F-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm21 = [7,15,7,15,7,15,7,15]
+; AVX512F-NEXT:    # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vpermt2q %zmm8, %zmm21, %zmm4
+; AVX512F-NEXT:    vpermt2q %zmm3, %zmm21, %zmm2
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
+; AVX512F-NEXT:    vmovdqa64 %zmm18, %zmm3
+; AVX512F-NEXT:    vpermt2q %zmm12, %zmm21, %zmm3
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm25[1],zmm15[1],zmm25[3],zmm15[3],zmm25[5],zmm15[5],zmm25[7],zmm15[7]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm7, %zmm2
+; AVX512F-NEXT:    vpermt2q %zmm9, %zmm27, %zmm2
+; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm3
+; AVX512F-NEXT:    vpermt2q %zmm5, %zmm27, %zmm3
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX512F-NEXT:    vmovdqa64 %zmm17, %zmm3
+; AVX512F-NEXT:    vpermt2q %zmm20, %zmm27, %zmm3
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm4 = zmm16[0],zmm19[0],zmm16[2],zmm19[2],zmm16[4],zmm19[4],zmm16[6],zmm19[6]
+; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm7, %zmm2
+; AVX512F-NEXT:    vpermt2q %zmm9, %zmm24, %zmm2
+; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm3
+; AVX512F-NEXT:    vpermt2q %zmm5, %zmm24, %zmm3
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX512F-NEXT:    vmovdqa64 %zmm17, %zmm3
+; AVX512F-NEXT:    vpermt2q %zmm20, %zmm24, %zmm3
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm4 = zmm16[1],zmm19[1],zmm16[3],zmm19[3],zmm16[5],zmm19[5],zmm16[7],zmm19[7]
+; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm7, %zmm2
+; AVX512F-NEXT:    vpermt2q %zmm9, %zmm22, %zmm2
+; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm3
+; AVX512F-NEXT:    vpermt2q %zmm5, %zmm22, %zmm3
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX512F-NEXT:    vmovdqa64 %zmm16, %zmm3
+; AVX512F-NEXT:    vpermt2q %zmm19, %zmm22, %zmm3
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm17[0],zmm20[0],zmm17[2],zmm20[2],zmm17[4],zmm20[4],zmm17[6],zmm20[6]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm9, %zmm21, %zmm7
+; AVX512F-NEXT:    vpermt2q %zmm5, %zmm21, %zmm6
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm7[4,5,6,7]
+; AVX512F-NEXT:    vmovdqa64 %zmm16, %zmm3
+; AVX512F-NEXT:    vpermt2q %zmm19, %zmm21, %zmm3
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm17[1],zmm20[1],zmm17[3],zmm20[3],zmm17[5],zmm20[5],zmm17[7],zmm20[7]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm2
+; AVX512F-NEXT:    vpermt2q %zmm1, %zmm27, %zmm2
+; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm4
+; AVX512F-NEXT:    vpermt2q %zmm0, %zmm27, %zmm4
+; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm4
+; AVX512F-NEXT:    vpermt2q %zmm1, %zmm24, %zmm4
+; AVX512F-NEXT:    vmovdqu64 %zmm4, (%rsp) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm4
+; AVX512F-NEXT:    vpermt2q %zmm0, %zmm24, %zmm4
+; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm4
+; AVX512F-NEXT:    vpermt2q %zmm1, %zmm22, %zmm4
+; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm4
+; AVX512F-NEXT:    vpermt2q %zmm0, %zmm22, %zmm4
+; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm1, %zmm21, %zmm3
 ; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 128(%rcx), %zmm5
-; AVX512F-NEXT:    vpermt2q %zmm20, %zmm29, %zmm1
-; AVX512F-NEXT:    vmovdqa64 %zmm19, %zmm3
-; AVX512F-NEXT:    vpermt2q %zmm28, %zmm29, %zmm3
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm16[1],zmm22[1],zmm16[3],zmm22[3],zmm16[5],zmm22[5],zmm16[7],zmm22[7]
-; AVX512F-NEXT:    vpermt2q %zmm21, %zmm0, %zmm2
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm3, %zmm1
-; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm1
-; AVX512F-NEXT:    vpermt2q %zmm5, %zmm12, %zmm1
+; AVX512F-NEXT:    vpermt2q %zmm0, %zmm21, %zmm2
+; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm1
+; AVX512F-NEXT:    vpermt2q %zmm23, %zmm27, %zmm1
 ; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm1
-; AVX512F-NEXT:    vpermt2q %zmm5, %zmm14, %zmm1
+; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm1
+; AVX512F-NEXT:    vpermt2q %zmm23, %zmm24, %zmm1
 ; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm1
-; AVX512F-NEXT:    vpermt2q %zmm5, %zmm4, %zmm1
+; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm1
+; AVX512F-NEXT:    vpermt2q %zmm23, %zmm22, %zmm1
 ; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm5, %zmm0, %zmm13
-; AVX512F-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 192(%rcx), %zmm1
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512F-NEXT:    vpermi2q %zmm1, %zmm13, %zmm12
-; AVX512F-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermi2q %zmm1, %zmm13, %zmm14
-; AVX512F-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermi2q %zmm1, %zmm13, %zmm4
-; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm0, %zmm13
-; AVX512F-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm25, %zmm15, %zmm0
+; AVX512F-NEXT:    vpermt2q %zmm23, %zmm21, %zmm0
 ; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm25, %zmm8, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm25, %zmm7, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm25, %zmm29, %zmm1
+; AVX512F-NEXT:    vmovdqa64 192(%rdi), %zmm1
+; AVX512F-NEXT:    vmovdqa64 192(%rsi), %zmm0
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm2
+; AVX512F-NEXT:    vpermt2q %zmm0, %zmm27, %zmm2
+; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm2
+; AVX512F-NEXT:    vpermt2q %zmm0, %zmm24, %zmm2
+; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm2
+; AVX512F-NEXT:    vpermt2q %zmm0, %zmm22, %zmm2
+; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm0, %zmm21, %zmm1
 ; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 128(%r10), %zmm30
-; AVX512F-NEXT:    vmovdqa64 128(%rax), %zmm6
-; AVX512F-NEXT:    vmovdqa64 %zmm30, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm6, %zmm15, %zmm0
+; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm30 = [0,8,0,8,0,8,0,8]
+; AVX512F-NEXT:    # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vmovdqa64 %zmm25, %zmm0
+; AVX512F-NEXT:    vpermt2q %zmm15, %zmm30, %zmm0
 ; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm30, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm6, %zmm8, %zmm0
+; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm31 = [1,9,1,9,1,9,1,9]
+; AVX512F-NEXT:    # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vmovdqa64 %zmm25, %zmm0
+; AVX512F-NEXT:    vpermt2q %zmm15, %zmm31, %zmm0
 ; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 128(%r8), %zmm21
-; AVX512F-NEXT:    vmovdqa64 128(%r9), %zmm9
-; AVX512F-NEXT:    vmovdqa64 %zmm21, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm9, %zmm7, %zmm0
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm10
-; AVX512F-NEXT:    vmovdqa64 192(%r10), %zmm14
-; AVX512F-NEXT:    vmovdqa64 192(%rax), %zmm4
-; AVX512F-NEXT:    vmovdqa64 %zmm14, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm4, %zmm15, %zmm0
+; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm23 = [2,10,2,10,2,10,2,10]
+; AVX512F-NEXT:    # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vmovdqa64 %zmm25, %zmm0
+; AVX512F-NEXT:    vpermt2q %zmm15, %zmm23, %zmm0
 ; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 192(%rdi), %zmm1
-; AVX512F-NEXT:    vmovdqa64 192(%rsi), %zmm0
-; AVX512F-NEXT:    vpermi2q %zmm0, %zmm1, %zmm15
-; AVX512F-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm14, %zmm2
-; AVX512F-NEXT:    vpermt2q %zmm4, %zmm8, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermi2q %zmm0, %zmm1, %zmm8
-; AVX512F-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 192(%r8), %zmm31
-; AVX512F-NEXT:    vmovdqa64 192(%r9), %zmm3
-; AVX512F-NEXT:    vmovdqa64 %zmm31, %zmm27
-; AVX512F-NEXT:    vpermt2q %zmm3, %zmm7, %zmm27
-; AVX512F-NEXT:    vpermi2q %zmm0, %zmm1, %zmm7
-; AVX512F-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm0, %zmm29, %zmm1
+; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm26 = [3,11,3,11,3,11,3,11]
+; AVX512F-NEXT:    # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vpermt2q %zmm15, %zmm26, %zmm25
+; AVX512F-NEXT:    vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm18, %zmm0
+; AVX512F-NEXT:    vmovdqa64 %zmm18, %zmm29
+; AVX512F-NEXT:    vpermt2q %zmm12, %zmm30, %zmm29
+; AVX512F-NEXT:    vpermt2q %zmm12, %zmm31, %zmm18
+; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm1
+; AVX512F-NEXT:    vpermt2q %zmm12, %zmm23, %zmm1
 ; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm21, %zmm20
-; AVX512F-NEXT:    vpermt2q %zmm9, %zmm29, %zmm20
-; AVX512F-NEXT:    vmovdqa64 %zmm31, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm3, %zmm29, %zmm0
+; AVX512F-NEXT:    vpermt2q %zmm12, %zmm26, %zmm0
 ; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [0,8,0,8,0,8,0,8]
-; AVX512F-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512F-NEXT:    vmovdqa64 %zmm17, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm24, %zmm2, %zmm0
+; AVX512F-NEXT:    vpermt2q %zmm20, %zmm30, %zmm0
 ; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [1,9,1,9,1,9,1,9]
-; AVX512F-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512F-NEXT:    vmovdqa64 %zmm17, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm24, %zmm1, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm29 = [2,10,2,10,2,10,2,10]
-; AVX512F-NEXT:    # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vpermt2q %zmm20, %zmm31, %zmm0
+; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-NEXT:    vmovdqa64 %zmm17, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm24, %zmm29, %zmm0
+; AVX512F-NEXT:    vpermt2q %zmm20, %zmm23, %zmm0
 ; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11]
-; AVX512F-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vpermt2q %zmm24, %zmm0, %zmm17
+; AVX512F-NEXT:    vpermt2q %zmm20, %zmm26, %zmm17
 ; AVX512F-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm23, %zmm13
-; AVX512F-NEXT:    vpermt2q %zmm18, %zmm2, %zmm13
-; AVX512F-NEXT:    vmovdqa64 %zmm23, %zmm12
-; AVX512F-NEXT:    vpermt2q %zmm18, %zmm1, %zmm12
-; AVX512F-NEXT:    vmovdqa64 %zmm23, %zmm24
-; AVX512F-NEXT:    vpermt2q %zmm18, %zmm29, %zmm24
-; AVX512F-NEXT:    vpermt2q %zmm18, %zmm0, %zmm23
-; AVX512F-NEXT:    vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm16, %zmm26
-; AVX512F-NEXT:    vpermt2q %zmm22, %zmm2, %zmm26
-; AVX512F-NEXT:    vmovdqa64 %zmm16, %zmm23
-; AVX512F-NEXT:    vpermt2q %zmm22, %zmm1, %zmm23
-; AVX512F-NEXT:    vmovdqa64 %zmm16, %zmm5
-; AVX512F-NEXT:    vpermt2q %zmm22, %zmm29, %zmm5
-; AVX512F-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm22, %zmm0, %zmm16
+; AVX512F-NEXT:    vmovdqa64 %zmm16, %zmm14
+; AVX512F-NEXT:    vpermt2q %zmm19, %zmm30, %zmm14
+; AVX512F-NEXT:    vmovdqa64 %zmm16, %zmm11
+; AVX512F-NEXT:    vpermt2q %zmm19, %zmm31, %zmm11
+; AVX512F-NEXT:    vmovdqa64 %zmm16, %zmm25
+; AVX512F-NEXT:    vpermt2q %zmm19, %zmm23, %zmm25
+; AVX512F-NEXT:    vpermt2q %zmm19, %zmm26, %zmm16
 ; AVX512F-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm19, %zmm5
-; AVX512F-NEXT:    vpermt2q %zmm28, %zmm2, %zmm19
-; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm18
-; AVX512F-NEXT:    vpermt2q %zmm28, %zmm1, %zmm18
-; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm22
-; AVX512F-NEXT:    vpermt2q %zmm28, %zmm29, %zmm22
-; AVX512F-NEXT:    vpermt2q %zmm28, %zmm0, %zmm5
-; AVX512F-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm30, %zmm15
-; AVX512F-NEXT:    vpermt2q %zmm6, %zmm2, %zmm15
-; AVX512F-NEXT:    vmovdqa64 %zmm30, %zmm16
-; AVX512F-NEXT:    vpermt2q %zmm6, %zmm1, %zmm16
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm10 {%k1} = zmm30[0],zmm6[0],zmm30[2],zmm6[2],zmm30[4],zmm6[4],zmm30[6],zmm6[6]
-; AVX512F-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm20 {%k1} = zmm30[1],zmm6[1],zmm30[3],zmm6[3],zmm30[5],zmm6[5],zmm30[7],zmm6[7]
-; AVX512F-NEXT:    vmovdqa64 %zmm30, %zmm5
-; AVX512F-NEXT:    vpermt2q %zmm6, %zmm29, %zmm5
-; AVX512F-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm6, %zmm0, %zmm30
-; AVX512F-NEXT:    vmovdqa64 %zmm21, %zmm17
-; AVX512F-NEXT:    vpermt2q %zmm9, %zmm2, %zmm17
-; AVX512F-NEXT:    vmovdqa64 %zmm21, %zmm5
-; AVX512F-NEXT:    vpermt2q %zmm9, %zmm1, %zmm5
-; AVX512F-NEXT:    vmovdqa64 %zmm21, %zmm11
-; AVX512F-NEXT:    vpermt2q %zmm9, %zmm29, %zmm11
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm7 = zmm21[0],zmm9[0],zmm21[2],zmm9[2],zmm21[4],zmm9[4],zmm21[6],zmm9[6]
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm8 = zmm21[1],zmm9[1],zmm21[3],zmm9[3],zmm21[5],zmm9[5],zmm21[7],zmm9[7]
-; AVX512F-NEXT:    vpermt2q %zmm9, %zmm0, %zmm21
-; AVX512F-NEXT:    vmovdqa64 %zmm14, %zmm10
-; AVX512F-NEXT:    vpermt2q %zmm4, %zmm2, %zmm10
-; AVX512F-NEXT:    vmovdqa64 %zmm14, %zmm25
-; AVX512F-NEXT:    vpermt2q %zmm4, %zmm1, %zmm25
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm27 {%k1} = zmm14[0],zmm4[0],zmm14[2],zmm4[2],zmm14[4],zmm4[4],zmm14[6],zmm4[6]
-; AVX512F-NEXT:    vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm9 {%k1} = zmm14[1],zmm4[1],zmm14[3],zmm4[3],zmm14[5],zmm4[5],zmm14[7],zmm4[7]
-; AVX512F-NEXT:    vmovdqa64 %zmm14, %zmm27
-; AVX512F-NEXT:    vpermt2q %zmm4, %zmm29, %zmm27
-; AVX512F-NEXT:    vpermt2q %zmm4, %zmm0, %zmm14
-; AVX512F-NEXT:    vpermi2q %zmm3, %zmm31, %zmm2
-; AVX512F-NEXT:    vpermi2q %zmm3, %zmm31, %zmm1
-; AVX512F-NEXT:    vpermi2q %zmm3, %zmm31, %zmm29
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm4 = zmm31[0],zmm3[0],zmm31[2],zmm3[2],zmm31[4],zmm3[4],zmm31[6],zmm3[6]
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm6 = zmm31[1],zmm3[1],zmm31[3],zmm3[3],zmm31[5],zmm3[5],zmm31[7],zmm3[7]
-; AVX512F-NEXT:    vpermt2q %zmm3, %zmm0, %zmm31
+; AVX512F-NEXT:    vmovdqa64 128(%r10), %zmm19
+; AVX512F-NEXT:    vmovdqa64 128(%rax), %zmm0
+; AVX512F-NEXT:    vmovdqa64 %zmm19, %zmm28
+; AVX512F-NEXT:    vpermt2q %zmm0, %zmm30, %zmm28
+; AVX512F-NEXT:    vmovdqa64 %zmm19, %zmm1
+; AVX512F-NEXT:    vpermt2q %zmm0, %zmm31, %zmm1
+; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm19, %zmm13
+; AVX512F-NEXT:    vpermt2q %zmm0, %zmm27, %zmm13
+; AVX512F-NEXT:    vmovdqa64 %zmm19, %zmm16
+; AVX512F-NEXT:    vpermt2q %zmm0, %zmm24, %zmm16
+; AVX512F-NEXT:    vmovdqa64 128(%r8), %zmm20
+; AVX512F-NEXT:    vmovdqa64 128(%r9), %zmm1
+; AVX512F-NEXT:    vmovdqa64 %zmm20, %zmm3
+; AVX512F-NEXT:    vpermt2q %zmm1, %zmm22, %zmm3
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm19[0],zmm0[0],zmm19[2],zmm0[2],zmm19[4],zmm0[4],zmm19[6],zmm0[6]
+; AVX512F-NEXT:    vmovdqa64 %zmm20, %zmm10
+; AVX512F-NEXT:    vpermt2q %zmm1, %zmm21, %zmm10
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm10 {%k1} = zmm19[1],zmm0[1],zmm19[3],zmm0[3],zmm19[5],zmm0[5],zmm19[7],zmm0[7]
+; AVX512F-NEXT:    vmovdqa64 %zmm19, %zmm2
+; AVX512F-NEXT:    vpermt2q %zmm0, %zmm23, %zmm2
+; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm0, %zmm26, %zmm19
+; AVX512F-NEXT:    vmovdqa64 %zmm20, %zmm4
+; AVX512F-NEXT:    vpermt2q %zmm1, %zmm30, %zmm4
+; AVX512F-NEXT:    vmovdqa64 %zmm20, %zmm6
+; AVX512F-NEXT:    vpermt2q %zmm1, %zmm31, %zmm6
+; AVX512F-NEXT:    vmovdqa64 %zmm20, %zmm17
+; AVX512F-NEXT:    vpermt2q %zmm1, %zmm23, %zmm17
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm5 = zmm20[0],zmm1[0],zmm20[2],zmm1[2],zmm20[4],zmm1[4],zmm20[6],zmm1[6]
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm7 = zmm20[1],zmm1[1],zmm20[3],zmm1[3],zmm20[5],zmm1[5],zmm20[7],zmm1[7]
+; AVX512F-NEXT:    vpermt2q %zmm1, %zmm26, %zmm20
+; AVX512F-NEXT:    vmovdqa64 192(%r10), %zmm8
+; AVX512F-NEXT:    vmovdqa64 192(%rax), %zmm1
+; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm12
+; AVX512F-NEXT:    vpermt2q %zmm1, %zmm30, %zmm12
+; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm15
+; AVX512F-NEXT:    vpermt2q %zmm1, %zmm31, %zmm15
+; AVX512F-NEXT:    vpermi2q %zmm1, %zmm8, %zmm27
+; AVX512F-NEXT:    vpermi2q %zmm1, %zmm8, %zmm24
+; AVX512F-NEXT:    vmovdqa64 192(%r8), %zmm2
+; AVX512F-NEXT:    vmovdqa64 192(%r9), %zmm0
+; AVX512F-NEXT:    vpermi2q %zmm0, %zmm2, %zmm22
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm22 {%k1} = zmm8[0],zmm1[0],zmm8[2],zmm1[2],zmm8[4],zmm1[4],zmm8[6],zmm1[6]
+; AVX512F-NEXT:    vpermi2q %zmm0, %zmm2, %zmm21
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm21 {%k1} = zmm8[1],zmm1[1],zmm8[3],zmm1[3],zmm8[5],zmm1[5],zmm8[7],zmm1[7]
+; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm9
+; AVX512F-NEXT:    vpermt2q %zmm1, %zmm23, %zmm9
+; AVX512F-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm1, %zmm26, %zmm8
+; AVX512F-NEXT:    vpermi2q %zmm0, %zmm2, %zmm30
+; AVX512F-NEXT:    vpermi2q %zmm0, %zmm2, %zmm31
+; AVX512F-NEXT:    vpermi2q %zmm0, %zmm2, %zmm23
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm1 = zmm2[0],zmm0[0],zmm2[2],zmm0[2],zmm2[4],zmm0[4],zmm2[6],zmm0[6]
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm9 = zmm2[1],zmm0[1],zmm2[3],zmm0[3],zmm2[5],zmm0[5],zmm2[7],zmm0[7]
+; AVX512F-NEXT:    vpermt2q %zmm0, %zmm26, %zmm2
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm13 {%k1}
-; AVX512F-NEXT:    vmovdqa (%rsi), %xmm0
-; AVX512F-NEXT:    vinserti128 $1, (%rcx), %ymm0, %ymm0
-; AVX512F-NEXT:    vmovdqa (%rdi), %xmm3
-; AVX512F-NEXT:    vinserti128 $1, (%rdx), %ymm3, %ymm3
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm28 = ymm3[0],ymm0[0],ymm3[2],ymm0[2]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm28, %zmm13, %zmm13
-; AVX512F-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 (%rsp), %zmm13 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm12 {%k1}
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm3[1],ymm0[1],ymm3[3],ymm0[3]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm12, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm26, %zmm19 {%k1}
-; AVX512F-NEXT:    vmovdqa 64(%rsi), %xmm0
-; AVX512F-NEXT:    vinserti128 $1, 64(%rcx), %ymm0, %ymm0
-; AVX512F-NEXT:    vmovdqa 64(%rdi), %xmm12
-; AVX512F-NEXT:    vinserti128 $1, 64(%rdx), %ymm12, %ymm12
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm0[0],ymm12[2],ymm0[2]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm13, %zmm19, %zmm3
-; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm23, %zmm18 {%k1}
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm12[1],ymm0[1],ymm12[3],ymm0[3]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm18, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm15, %zmm17 {%k1}
-; AVX512F-NEXT:    vmovdqa 128(%rsi), %xmm0
-; AVX512F-NEXT:    vinserti128 $1, 128(%rcx), %ymm0, %ymm0
-; AVX512F-NEXT:    vmovdqa 128(%rdi), %xmm12
-; AVX512F-NEXT:    vinserti128 $1, 128(%rdx), %ymm12, %ymm13
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm12 = ymm13[0],ymm0[0],ymm13[2],ymm0[2]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm12, %zmm17, %zmm3
-; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm16, %zmm5 {%k1}
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm13[1],ymm0[1],ymm13[3],ymm0[3]
+; AVX512F-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX512F-NEXT:    # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm5 {%k1}
 ; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm5, %zmm0
 ; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm7 {%k1}
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload
-; AVX512F-NEXT:    # ymm5 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm5, %zmm7, %zmm28
+; AVX512F-NEXT:    vpblendd $240, (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX512F-NEXT:    # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512F-NEXT:    vmovdqa64 %zmm16, %zmm7 {%k1}
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm7, %zmm0
+; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm8 {%k1}
+; AVX512F-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX512F-NEXT:    # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm3, %zmm0
+; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload
-; AVX512F-NEXT:    # ymm5 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm5, %zmm8, %zmm26
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload
-; AVX512F-NEXT:    # ymm5 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vinsertf64x4 $0, %ymm5, %zmm0, %zmm19
+; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm29 {%k1}
+; AVX512F-NEXT:    vmovdqa (%rsi), %xmm0
+; AVX512F-NEXT:    vinserti128 $1, (%rcx), %ymm0, %ymm3
+; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512F-NEXT:    vinserti128 $1, (%rdx), %ymm0, %ymm5
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm5[0],ymm3[0],ymm5[2],ymm3[2]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm29, %zmm0
+; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload
-; AVX512F-NEXT:    # ymm5 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm5, %zmm20, %zmm18
-; AVX512F-NEXT:    vmovdqa64 %zmm10, %zmm2 {%k1}
-; AVX512F-NEXT:    vmovdqa 192(%rsi), %xmm7
-; AVX512F-NEXT:    vinserti128 $1, 192(%rcx), %ymm7, %ymm7
-; AVX512F-NEXT:    vmovdqa 192(%rdi), %xmm10
-; AVX512F-NEXT:    vinserti128 $1, 192(%rdx), %ymm10, %ymm10
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm15 = ymm10[0],ymm7[0],ymm10[2],ymm7[2]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm15, %zmm2, %zmm17
-; AVX512F-NEXT:    vmovdqa64 %zmm25, %zmm1 {%k1}
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm7 = ymm10[1],ymm7[1],ymm10[3],ymm7[3]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm7, %zmm1, %zmm1
+; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm18 {%k1}
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm5[1],ymm3[1],ymm5[3],ymm3[3]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm3, %zmm18, %zmm0
+; AVX512F-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm4 {%k1}
+; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm14 {%k1}
+; AVX512F-NEXT:    vmovdqa 64(%rsi), %xmm3
+; AVX512F-NEXT:    vinserti128 $1, 64(%rcx), %ymm3, %ymm3
+; AVX512F-NEXT:    vmovdqa 64(%rdi), %xmm5
+; AVX512F-NEXT:    vinserti128 $1, 64(%rdx), %ymm5, %ymm5
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm7 = ymm5[0],ymm3[0],ymm5[2],ymm3[2]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm7, %zmm14, %zmm29
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload
-; AVX512F-NEXT:    # ymm7 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm7, %zmm4, %zmm7
+; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm11 {%k1}
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm5[1],ymm3[1],ymm5[3],ymm3[3]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm3, %zmm11, %zmm26
+; AVX512F-NEXT:    vmovdqa64 %zmm28, %zmm4 {%k1}
+; AVX512F-NEXT:    vmovdqa 128(%rsi), %xmm3
+; AVX512F-NEXT:    vinserti128 $1, 128(%rcx), %ymm3, %ymm3
+; AVX512F-NEXT:    vmovdqa 128(%rdi), %xmm5
+; AVX512F-NEXT:    vinserti128 $1, 128(%rdx), %ymm5, %ymm5
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm7 = ymm5[0],ymm3[0],ymm5[2],ymm3[2]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm7, %zmm4, %zmm28
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm6 {%k1}
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm5[1],ymm3[1],ymm5[3],ymm3[3]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm3, %zmm6, %zmm18
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512F-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload
 ; AVX512F-NEXT:    # ymm4 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm4, %zmm6, %zmm4
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload
-; AVX512F-NEXT:    # ymm6 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vinsertf64x4 $0, %ymm6, %zmm0, %zmm10
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm4, %zmm10, %zmm16
+; AVX512F-NEXT:    vmovdqa64 %zmm12, %zmm30 {%k1}
+; AVX512F-NEXT:    vmovdqa 192(%rsi), %xmm4
+; AVX512F-NEXT:    vinserti128 $1, 192(%rcx), %ymm4, %ymm5
+; AVX512F-NEXT:    vmovdqa 192(%rdi), %xmm4
+; AVX512F-NEXT:    vinserti128 $1, 192(%rdx), %ymm4, %ymm6
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm4 = ymm6[0],ymm5[0],ymm6[2],ymm5[2]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm4, %zmm30, %zmm30
+; AVX512F-NEXT:    vmovdqa64 %zmm15, %zmm31 {%k1}
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm5 = ymm6[1],ymm5[1],ymm6[3],ymm5[3]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm5, %zmm31, %zmm31
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512F-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload
 ; AVX512F-NEXT:    # ymm6 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm6, %zmm9, %zmm15
+; AVX512F-NEXT:    vmovdqa64 %zmm27, %zmm1 {%k1}
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm6, %zmm1, %zmm6
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm24 {%k1}
-; AVX512F-NEXT:    vmovdqa (%rcx), %ymm6
-; AVX512F-NEXT:    vmovdqa64 (%rdx), %ymm16
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm16[0],ymm6[0],ymm16[2],ymm6[2]
-; AVX512F-NEXT:    vmovdqa64 (%rsi), %ymm20
-; AVX512F-NEXT:    vmovdqa64 (%rdi), %ymm23
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm12 = ymm23[0],ymm20[0],ymm23[2],ymm20[2]
-; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm12[2,3],ymm3[2,3]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm3, %zmm24, %zmm3
+; AVX512F-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
+; AVX512F-NEXT:    # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512F-NEXT:    vmovdqa64 %zmm24, %zmm9 {%k1}
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm9, %zmm7
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k1}
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm6 = ymm16[1],ymm6[1],ymm16[3],ymm6[3]
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm12 = ymm23[1],ymm20[1],ymm23[3],ymm20[3]
-; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm6 = ymm12[2,3],ymm6[2,3]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm6, %zmm2, %zmm6
+; AVX512F-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
+; AVX512F-NEXT:    # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm22, %zmm9
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm22 {%k1}
-; AVX512F-NEXT:    vmovdqa 64(%rcx), %ymm12
-; AVX512F-NEXT:    vmovdqa64 64(%rdx), %ymm16
-; AVX512F-NEXT:    vmovdqa64 64(%rsi), %ymm20
-; AVX512F-NEXT:    vmovdqa64 64(%rdi), %ymm23
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm16[0],ymm12[0],ymm16[2],ymm12[2]
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm13 = ymm23[0],ymm20[0],ymm23[2],ymm20[2]
-; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm22, %zmm0
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm5 {%k1}
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm12 = ymm16[1],ymm12[1],ymm16[3],ymm12[3]
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm13 = ymm23[1],ymm20[1],ymm23[3],ymm20[3]
-; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm12 = ymm13[2,3],ymm12[2,3]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm12, %zmm5, %zmm12
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm11 {%k1}
-; AVX512F-NEXT:    vmovdqa 128(%rcx), %ymm13
-; AVX512F-NEXT:    vmovdqa64 128(%rdx), %ymm16
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm5 = ymm16[0],ymm13[0],ymm16[2],ymm13[2]
-; AVX512F-NEXT:    vmovdqa64 128(%rsi), %ymm20
+; AVX512F-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
+; AVX512F-NEXT:    # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm21, %zmm12
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm3 {%k1}
+; AVX512F-NEXT:    vmovdqa (%rcx), %ymm1
+; AVX512F-NEXT:    vmovdqa (%rdx), %ymm15
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm15[0],ymm1[0],ymm15[2],ymm1[2]
+; AVX512F-NEXT:    vmovdqa64 (%rsi), %ymm21
+; AVX512F-NEXT:    vmovdqa64 (%rdi), %ymm22
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm14 = ymm22[0],ymm21[0],ymm22[2],ymm21[2]
+; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm14[2,3],ymm0[2,3]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm3, %zmm0
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm15[1],ymm1[1],ymm15[3],ymm1[3]
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm14 = ymm22[1],ymm21[1],ymm22[3],ymm21[3]
+; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm14[2,3],ymm1[2,3]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm25 {%k1}
+; AVX512F-NEXT:    vmovdqa 64(%rcx), %ymm14
+; AVX512F-NEXT:    vmovdqa 64(%rdx), %ymm15
+; AVX512F-NEXT:    vmovdqa64 64(%rsi), %ymm21
+; AVX512F-NEXT:    vmovdqa64 64(%rdi), %ymm22
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm13 = ymm15[0],ymm14[0],ymm15[2],ymm14[2]
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm11 = ymm22[0],ymm21[0],ymm22[2],ymm21[2]
+; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm11 = ymm11[2,3],ymm13[2,3]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm11, %zmm25, %zmm11
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm13 = ymm15[1],ymm14[1],ymm15[3],ymm14[3]
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm14 = ymm22[1],ymm21[1],ymm22[3],ymm21[3]
+; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm13 = ymm14[2,3],ymm13[2,3]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm13, %zmm4, %zmm13
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm17 {%k1}
+; AVX512F-NEXT:    vmovdqa 128(%rcx), %ymm14
+; AVX512F-NEXT:    vmovdqa 128(%rdx), %ymm15
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm4 = ymm15[0],ymm14[0],ymm15[2],ymm14[2]
+; AVX512F-NEXT:    vmovdqa64 128(%rsi), %ymm21
 ; AVX512F-NEXT:    vmovdqa64 128(%rdi), %ymm22
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm8 = ymm22[0],ymm20[0],ymm22[2],ymm20[2]
-; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm5 = ymm8[2,3],ymm5[2,3]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm5, %zmm11, %zmm5
-; AVX512F-NEXT:    vmovdqa64 %zmm30, %zmm21 {%k1}
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm8 = ymm16[1],ymm13[1],ymm16[3],ymm13[3]
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm11 = ymm22[1],ymm20[1],ymm22[3],ymm20[3]
-; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm8 = ymm11[2,3],ymm8[2,3]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm8, %zmm21, %zmm8
-; AVX512F-NEXT:    vmovdqa64 %zmm27, %zmm29 {%k1}
-; AVX512F-NEXT:    vmovdqa 192(%rcx), %ymm9
-; AVX512F-NEXT:    vmovdqa 192(%rdx), %ymm11
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm13 = ymm11[0],ymm9[0],ymm11[2],ymm9[2]
-; AVX512F-NEXT:    vmovdqa64 192(%rsi), %ymm16
-; AVX512F-NEXT:    vmovdqa64 192(%rdi), %ymm20
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm2 = ymm20[0],ymm16[0],ymm20[2],ymm16[2]
-; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm2[2,3],ymm13[2,3]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm29, %zmm2
-; AVX512F-NEXT:    vmovdqa64 %zmm14, %zmm31 {%k1}
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm9 = ymm11[1],ymm9[1],ymm11[3],ymm9[3]
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm11 = ymm20[1],ymm16[1],ymm20[3],ymm16[3]
-; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm9 = ymm11[2,3],ymm9[2,3]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm9, %zmm31, %zmm9
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm22[0],ymm21[0],ymm22[2],ymm21[2]
+; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm4[2,3]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm3, %zmm17, %zmm3
+; AVX512F-NEXT:    vmovdqa64 %zmm19, %zmm20 {%k1}
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm4 = ymm15[1],ymm14[1],ymm15[3],ymm14[3]
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm14 = ymm22[1],ymm21[1],ymm22[3],ymm21[3]
+; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm4 = ymm14[2,3],ymm4[2,3]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm4, %zmm20, %zmm4
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm23 {%k1}
+; AVX512F-NEXT:    vmovdqa 192(%rcx), %ymm14
+; AVX512F-NEXT:    vmovdqa 192(%rdx), %ymm15
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm5 = ymm15[0],ymm14[0],ymm15[2],ymm14[2]
+; AVX512F-NEXT:    vmovdqa64 192(%rsi), %ymm17
+; AVX512F-NEXT:    vmovdqa64 192(%rdi), %ymm19
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm10 = ymm19[0],ymm17[0],ymm19[2],ymm17[2]
+; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm5 = ymm10[2,3],ymm5[2,3]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm5, %zmm23, %zmm5
+; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm2 {%k1}
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm8 = ymm15[1],ymm14[1],ymm15[3],ymm14[3]
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm10 = ymm19[1],ymm17[1],ymm19[3],ymm17[3]
+; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm8 = ymm10[2,3],ymm8[2,3]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm8, %zmm2, %zmm2
 ; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512F-NEXT:    vmovdqa64 %zmm9, 1728(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm2, 1664(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm8, 1216(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm5, 1152(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm12, 704(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm0, 640(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm6, 192(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm3, 128(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm15, 1984(%rax)
-; AVX512F-NEXT:    vmovaps %zmm10, 1920(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm4, 1856(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm7, 1792(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm1, 1600(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm17, 1536(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm18, 1472(%rax)
-; AVX512F-NEXT:    vmovaps %zmm19, 1408(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm26, 1344(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm28, 1280(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm2, 1728(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm5, 1664(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm4, 1216(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm3, 1152(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm13, 704(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm11, 640(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm1, 192(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm0, 128(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm12, 1984(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm9, 1920(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm7, 1856(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm6, 1792(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm31, 1600(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm30, 1536(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm16, 1472(%rax)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm0, 1408(%rax)
 ; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 1088(%rax)
+; AVX512F-NEXT:    vmovaps %zmm0, 1344(%rax)
 ; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 1024(%rax)
+; AVX512F-NEXT:    vmovaps %zmm0, 1280(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm18, 1088(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm28, 1024(%rax)
 ; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512F-NEXT:    vmovaps %zmm0, 960(%rax)
 ; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
@@ -4605,10 +4567,8 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512F-NEXT:    vmovaps %zmm0, 832(%rax)
 ; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512F-NEXT:    vmovaps %zmm0, 768(%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 576(%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 512(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm26, 576(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm29, 512(%rax)
 ; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512F-NEXT:    vmovaps %zmm0, 448(%rax)
 ; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
@@ -4621,467 +4581,457 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512F-NEXT:    vmovaps %zmm0, 64(%rax)
 ; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512F-NEXT:    vmovaps %zmm0, (%rax)
-; AVX512F-NEXT:    addq $2632, %rsp # imm = 0xA48
+; AVX512F-NEXT:    addq $2312, %rsp # imm = 0x908
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512BW-LABEL: store_i64_stride8_vf32:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    subq $2632, %rsp # imm = 0xA48
+; AVX512BW-NEXT:    subq $2312, %rsp # imm = 0x908
 ; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %r10
 ; AVX512BW-NEXT:    vmovaps 128(%rdi), %zmm0
 ; AVX512BW-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %zmm1
-; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm5
-; AVX512BW-NEXT:    vmovdqa64 128(%rsi), %zmm25
-; AVX512BW-NEXT:    vmovdqa64 64(%rsi), %zmm20
-; AVX512BW-NEXT:    vmovdqa64 (%rsi), %zmm0
-; AVX512BW-NEXT:    vmovaps 192(%rdx), %zmm2
-; AVX512BW-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 128(%rdx), %zmm13
-; AVX512BW-NEXT:    vmovdqa64 64(%rdx), %zmm2
-; AVX512BW-NEXT:    vmovdqa64 (%rdx), %zmm6
-; AVX512BW-NEXT:    vmovdqa64 64(%rcx), %zmm21
-; AVX512BW-NEXT:    vmovdqa64 (%rcx), %zmm7
-; AVX512BW-NEXT:    vmovdqa64 (%r8), %zmm23
-; AVX512BW-NEXT:    vmovdqa64 64(%r8), %zmm19
-; AVX512BW-NEXT:    vmovdqa64 (%r9), %zmm18
-; AVX512BW-NEXT:    vmovdqa64 64(%r9), %zmm28
-; AVX512BW-NEXT:    vmovdqa64 (%r10), %zmm17
-; AVX512BW-NEXT:    vmovdqa64 64(%r10), %zmm16
-; AVX512BW-NEXT:    vmovdqa64 (%rax), %zmm24
-; AVX512BW-NEXT:    vmovdqa64 64(%rax), %zmm22
+; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %zmm6
+; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm2
+; AVX512BW-NEXT:    vmovdqa64 128(%rsi), %zmm0
+; AVX512BW-NEXT:    vmovdqa64 64(%rsi), %zmm5
+; AVX512BW-NEXT:    vmovdqa64 (%rsi), %zmm3
+; AVX512BW-NEXT:    vmovaps 192(%rdx), %zmm1
+; AVX512BW-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovaps 128(%rdx), %zmm1
+; AVX512BW-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 64(%rdx), %zmm7
+; AVX512BW-NEXT:    vmovdqa64 (%rdx), %zmm4
+; AVX512BW-NEXT:    vmovdqa64 192(%rcx), %zmm23
+; AVX512BW-NEXT:    vmovdqa64 128(%rcx), %zmm1
+; AVX512BW-NEXT:    vmovdqa64 64(%rcx), %zmm9
+; AVX512BW-NEXT:    vmovdqa64 (%rcx), %zmm8
+; AVX512BW-NEXT:    vmovdqa64 (%r8), %zmm18
+; AVX512BW-NEXT:    vmovdqa64 64(%r8), %zmm16
+; AVX512BW-NEXT:    vmovdqa64 (%r9), %zmm12
+; AVX512BW-NEXT:    vmovdqa64 64(%r9), %zmm19
+; AVX512BW-NEXT:    vmovdqa64 (%r10), %zmm25
+; AVX512BW-NEXT:    vmovdqa64 64(%r10), %zmm17
+; AVX512BW-NEXT:    vmovdqa64 (%rax), %zmm15
+; AVX512BW-NEXT:    vmovdqa64 64(%rax), %zmm20
 ; AVX512BW-NEXT:    movb $-64, %r11b
 ; AVX512BW-NEXT:    kmovd %r11d, %k1
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm15 = [4,12,4,12,4,12,4,12]
-; AVX512BW-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm17, %zmm9
-; AVX512BW-NEXT:    vpermt2q %zmm24, %zmm15, %zmm9
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm10 = zmm23[0],zmm18[0],zmm23[2],zmm18[2],zmm23[4],zmm18[4],zmm23[6],zmm18[6]
-; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm10 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm9
-; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm15, %zmm9
-; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm12 = [4,12,4,12]
-; AVX512BW-NEXT:    # ymm12 = mem[0,1,0,1]
-; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm11
-; AVX512BW-NEXT:    vpermt2q %zmm7, %zmm12, %zmm11
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm11[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm9, %zmm10, %zmm9
-; AVX512BW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm8 = [5,13,5,13,5,13,5,13]
-; AVX512BW-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm17, %zmm9
-; AVX512BW-NEXT:    vpermt2q %zmm24, %zmm8, %zmm9
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm10 = zmm23[1],zmm18[1],zmm23[3],zmm18[3],zmm23[5],zmm18[5],zmm23[7],zmm18[7]
-; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm10 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm9
-; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm8, %zmm9
-; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm14 = [5,13,5,13]
-; AVX512BW-NEXT:    # ymm14 = mem[0,1,0,1]
-; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm11
-; AVX512BW-NEXT:    vpermt2q %zmm7, %zmm14, %zmm11
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm11[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm9, %zmm10, %zmm9
-; AVX512BW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [6,14,6,14,6,14,6,14]
-; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm23, %zmm9
-; AVX512BW-NEXT:    vpermt2q %zmm18, %zmm3, %zmm9
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm17[0],zmm24[0],zmm17[2],zmm24[2],zmm17[4],zmm24[4],zmm17[6],zmm24[6]
-; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm10
-; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm3, %zmm10
-; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [6,14,6,14]
-; AVX512BW-NEXT:    # ymm4 = mem[0,1,0,1]
-; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm11
-; AVX512BW-NEXT:    vpermt2q %zmm7, %zmm4, %zmm11
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm10, %zmm9, %zmm9
-; AVX512BW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm29 = [7,15,7,15,7,15,7,15]
-; AVX512BW-NEXT:    # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm23, %zmm9
-; AVX512BW-NEXT:    vpermt2q %zmm18, %zmm29, %zmm9
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm9 {%k1} = zmm17[1],zmm24[1],zmm17[3],zmm24[3],zmm17[5],zmm24[5],zmm17[7],zmm24[7]
-; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm29, %zmm5
-; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [7,15,7,15]
-; AVX512BW-NEXT:    # ymm0 = mem[0,1,0,1]
-; AVX512BW-NEXT:    vpermt2q %zmm7, %zmm0, %zmm6
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm5, %zmm9, %zmm5
-; AVX512BW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm16, %zmm5
-; AVX512BW-NEXT:    vpermt2q %zmm22, %zmm15, %zmm5
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm6 = zmm19[0],zmm28[0],zmm19[2],zmm28[2],zmm19[4],zmm28[4],zmm19[6],zmm28[6]
-; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm6 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm5
-; AVX512BW-NEXT:    vpermt2q %zmm20, %zmm15, %zmm5
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm7
-; AVX512BW-NEXT:    vpermt2q %zmm21, %zmm12, %zmm7
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm5, %zmm6, %zmm5
-; AVX512BW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm16, %zmm5
-; AVX512BW-NEXT:    vpermt2q %zmm22, %zmm8, %zmm5
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm6 = zmm19[1],zmm28[1],zmm19[3],zmm28[3],zmm19[5],zmm28[5],zmm19[7],zmm28[7]
-; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm6 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm5
-; AVX512BW-NEXT:    vpermt2q %zmm20, %zmm8, %zmm5
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm7
-; AVX512BW-NEXT:    vpermt2q %zmm21, %zmm14, %zmm7
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm5, %zmm6, %zmm5
-; AVX512BW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm5
-; AVX512BW-NEXT:    vpermt2q %zmm20, %zmm3, %zmm5
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm6
-; AVX512BW-NEXT:    vpermt2q %zmm21, %zmm4, %zmm6
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqa64 %zmm19, %zmm6
-; AVX512BW-NEXT:    vpermt2q %zmm28, %zmm3, %zmm6
-; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm7
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm16[0],zmm22[0],zmm16[2],zmm22[2],zmm16[4],zmm22[4],zmm16[6],zmm22[6]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm5, %zmm6, %zmm3
-; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 128(%rcx), %zmm5
-; AVX512BW-NEXT:    vpermt2q %zmm20, %zmm29, %zmm1
-; AVX512BW-NEXT:    vmovdqa64 %zmm19, %zmm3
-; AVX512BW-NEXT:    vpermt2q %zmm28, %zmm29, %zmm3
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm16[1],zmm22[1],zmm16[3],zmm22[3],zmm16[5],zmm22[5],zmm16[7],zmm22[7]
-; AVX512BW-NEXT:    vpermt2q %zmm21, %zmm0, %zmm2
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm3, %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm1
-; AVX512BW-NEXT:    vpermt2q %zmm5, %zmm12, %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm1
-; AVX512BW-NEXT:    vpermt2q %zmm5, %zmm14, %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm1
-; AVX512BW-NEXT:    vpermt2q %zmm5, %zmm4, %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm5, %zmm0, %zmm13
-; AVX512BW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 192(%rcx), %zmm1
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512BW-NEXT:    vpermi2q %zmm1, %zmm13, %zmm12
-; AVX512BW-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermi2q %zmm1, %zmm13, %zmm14
-; AVX512BW-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermi2q %zmm1, %zmm13, %zmm4
-; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm0, %zmm13
-; AVX512BW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm25, %zmm15, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm25, %zmm8, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm25, %zmm7, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm25, %zmm29, %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 128(%r10), %zmm30
-; AVX512BW-NEXT:    vmovdqa64 128(%rax), %zmm6
-; AVX512BW-NEXT:    vmovdqa64 %zmm30, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm6, %zmm15, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm30, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm6, %zmm8, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 128(%r8), %zmm21
-; AVX512BW-NEXT:    vmovdqa64 128(%r9), %zmm9
-; AVX512BW-NEXT:    vmovdqa64 %zmm21, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm9, %zmm7, %zmm0
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm10
-; AVX512BW-NEXT:    vmovdqa64 192(%r10), %zmm14
-; AVX512BW-NEXT:    vmovdqa64 192(%rax), %zmm4
-; AVX512BW-NEXT:    vmovdqa64 %zmm14, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm15, %zmm0
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm27 = [4,12,4,12,4,12,4,12]
+; AVX512BW-NEXT:    # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm10
+; AVX512BW-NEXT:    vpermt2q %zmm8, %zmm27, %zmm10
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm11
+; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm27, %zmm11
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqa64 %zmm25, %zmm11
+; AVX512BW-NEXT:    vpermt2q %zmm15, %zmm27, %zmm11
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm13 = zmm18[0],zmm12[0],zmm18[2],zmm12[2],zmm18[4],zmm12[4],zmm18[6],zmm12[6]
+; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm13 {%k1}
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm10, %zmm13, %zmm10
+; AVX512BW-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm24 = [5,13,5,13,5,13,5,13]
+; AVX512BW-NEXT:    # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm10
+; AVX512BW-NEXT:    vpermt2q %zmm8, %zmm24, %zmm10
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm11
+; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm24, %zmm11
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqa64 %zmm25, %zmm11
+; AVX512BW-NEXT:    vpermt2q %zmm15, %zmm24, %zmm11
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm14 = zmm18[1],zmm12[1],zmm18[3],zmm12[3],zmm18[5],zmm12[5],zmm18[7],zmm12[7]
+; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm14 {%k1}
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm10, %zmm14, %zmm10
+; AVX512BW-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm22 = [6,14,6,14,6,14,6,14]
+; AVX512BW-NEXT:    # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm10
+; AVX512BW-NEXT:    vpermt2q %zmm8, %zmm22, %zmm10
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm11
+; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm22, %zmm11
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqa64 %zmm18, %zmm11
+; AVX512BW-NEXT:    vpermt2q %zmm12, %zmm22, %zmm11
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm11 {%k1} = zmm25[0],zmm15[0],zmm25[2],zmm15[2],zmm25[4],zmm15[4],zmm25[6],zmm15[6]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm10, %zmm11, %zmm10
+; AVX512BW-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm21 = [7,15,7,15,7,15,7,15]
+; AVX512BW-NEXT:    # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vpermt2q %zmm8, %zmm21, %zmm4
+; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm21, %zmm2
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqa64 %zmm18, %zmm3
+; AVX512BW-NEXT:    vpermt2q %zmm12, %zmm21, %zmm3
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm25[1],zmm15[1],zmm25[3],zmm15[3],zmm25[5],zmm15[5],zmm25[7],zmm15[7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm2
+; AVX512BW-NEXT:    vpermt2q %zmm9, %zmm27, %zmm2
+; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm3
+; AVX512BW-NEXT:    vpermt2q %zmm5, %zmm27, %zmm3
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqa64 %zmm17, %zmm3
+; AVX512BW-NEXT:    vpermt2q %zmm20, %zmm27, %zmm3
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm4 = zmm16[0],zmm19[0],zmm16[2],zmm19[2],zmm16[4],zmm19[4],zmm16[6],zmm19[6]
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm2
+; AVX512BW-NEXT:    vpermt2q %zmm9, %zmm24, %zmm2
+; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm3
+; AVX512BW-NEXT:    vpermt2q %zmm5, %zmm24, %zmm3
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqa64 %zmm17, %zmm3
+; AVX512BW-NEXT:    vpermt2q %zmm20, %zmm24, %zmm3
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm4 = zmm16[1],zmm19[1],zmm16[3],zmm19[3],zmm16[5],zmm19[5],zmm16[7],zmm19[7]
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm2
+; AVX512BW-NEXT:    vpermt2q %zmm9, %zmm22, %zmm2
+; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm3
+; AVX512BW-NEXT:    vpermt2q %zmm5, %zmm22, %zmm3
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqa64 %zmm16, %zmm3
+; AVX512BW-NEXT:    vpermt2q %zmm19, %zmm22, %zmm3
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm17[0],zmm20[0],zmm17[2],zmm20[2],zmm17[4],zmm20[4],zmm17[6],zmm20[6]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm9, %zmm21, %zmm7
+; AVX512BW-NEXT:    vpermt2q %zmm5, %zmm21, %zmm6
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm7[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqa64 %zmm16, %zmm3
+; AVX512BW-NEXT:    vpermt2q %zmm19, %zmm21, %zmm3
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm17[1],zmm20[1],zmm17[3],zmm20[3],zmm17[5],zmm20[5],zmm17[7],zmm20[7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm2
+; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm27, %zmm2
+; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm4
+; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm27, %zmm4
+; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm4
+; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm24, %zmm4
+; AVX512BW-NEXT:    vmovdqu64 %zmm4, (%rsp) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm4
+; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm24, %zmm4
+; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm4
+; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm22, %zmm4
+; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm4
+; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm22, %zmm4
+; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm21, %zmm3
+; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm21, %zmm2
+; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm1
+; AVX512BW-NEXT:    vpermt2q %zmm23, %zmm27, %zmm1
+; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm1
+; AVX512BW-NEXT:    vpermt2q %zmm23, %zmm24, %zmm1
+; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm1
+; AVX512BW-NEXT:    vpermt2q %zmm23, %zmm22, %zmm1
+; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm23, %zmm21, %zmm0
 ; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqa64 192(%rdi), %zmm1
 ; AVX512BW-NEXT:    vmovdqa64 192(%rsi), %zmm0
-; AVX512BW-NEXT:    vpermi2q %zmm0, %zmm1, %zmm15
-; AVX512BW-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm14, %zmm2
-; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm8, %zmm2
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm2
+; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm27, %zmm2
 ; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermi2q %zmm0, %zmm1, %zmm8
-; AVX512BW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 192(%r8), %zmm31
-; AVX512BW-NEXT:    vmovdqa64 192(%r9), %zmm3
-; AVX512BW-NEXT:    vmovdqa64 %zmm31, %zmm27
-; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm7, %zmm27
-; AVX512BW-NEXT:    vpermi2q %zmm0, %zmm1, %zmm7
-; AVX512BW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm29, %zmm1
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm2
+; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm24, %zmm2
+; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm2
+; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm22, %zmm2
+; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm21, %zmm1
+; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm30 = [0,8,0,8,0,8,0,8]
+; AVX512BW-NEXT:    # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vmovdqa64 %zmm25, %zmm0
+; AVX512BW-NEXT:    vpermt2q %zmm15, %zmm30, %zmm0
+; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm31 = [1,9,1,9,1,9,1,9]
+; AVX512BW-NEXT:    # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vmovdqa64 %zmm25, %zmm0
+; AVX512BW-NEXT:    vpermt2q %zmm15, %zmm31, %zmm0
+; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm23 = [2,10,2,10,2,10,2,10]
+; AVX512BW-NEXT:    # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vmovdqa64 %zmm25, %zmm0
+; AVX512BW-NEXT:    vpermt2q %zmm15, %zmm23, %zmm0
+; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm26 = [3,11,3,11,3,11,3,11]
+; AVX512BW-NEXT:    # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vpermt2q %zmm15, %zmm26, %zmm25
+; AVX512BW-NEXT:    vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm18, %zmm0
+; AVX512BW-NEXT:    vmovdqa64 %zmm18, %zmm29
+; AVX512BW-NEXT:    vpermt2q %zmm12, %zmm30, %zmm29
+; AVX512BW-NEXT:    vpermt2q %zmm12, %zmm31, %zmm18
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm1
+; AVX512BW-NEXT:    vpermt2q %zmm12, %zmm23, %zmm1
 ; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm21, %zmm20
-; AVX512BW-NEXT:    vpermt2q %zmm9, %zmm29, %zmm20
-; AVX512BW-NEXT:    vmovdqa64 %zmm31, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm29, %zmm0
+; AVX512BW-NEXT:    vpermt2q %zmm12, %zmm26, %zmm0
 ; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [0,8,0,8,0,8,0,8]
-; AVX512BW-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vmovdqa64 %zmm17, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm24, %zmm2, %zmm0
+; AVX512BW-NEXT:    vpermt2q %zmm20, %zmm30, %zmm0
 ; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [1,9,1,9,1,9,1,9]
-; AVX512BW-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vmovdqa64 %zmm17, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm24, %zmm1, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm29 = [2,10,2,10,2,10,2,10]
-; AVX512BW-NEXT:    # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vpermt2q %zmm20, %zmm31, %zmm0
+; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqa64 %zmm17, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm24, %zmm29, %zmm0
+; AVX512BW-NEXT:    vpermt2q %zmm20, %zmm23, %zmm0
 ; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11]
-; AVX512BW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vpermt2q %zmm24, %zmm0, %zmm17
+; AVX512BW-NEXT:    vpermt2q %zmm20, %zmm26, %zmm17
 ; AVX512BW-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm23, %zmm13
-; AVX512BW-NEXT:    vpermt2q %zmm18, %zmm2, %zmm13
-; AVX512BW-NEXT:    vmovdqa64 %zmm23, %zmm12
-; AVX512BW-NEXT:    vpermt2q %zmm18, %zmm1, %zmm12
-; AVX512BW-NEXT:    vmovdqa64 %zmm23, %zmm24
-; AVX512BW-NEXT:    vpermt2q %zmm18, %zmm29, %zmm24
-; AVX512BW-NEXT:    vpermt2q %zmm18, %zmm0, %zmm23
-; AVX512BW-NEXT:    vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm16, %zmm26
-; AVX512BW-NEXT:    vpermt2q %zmm22, %zmm2, %zmm26
-; AVX512BW-NEXT:    vmovdqa64 %zmm16, %zmm23
-; AVX512BW-NEXT:    vpermt2q %zmm22, %zmm1, %zmm23
-; AVX512BW-NEXT:    vmovdqa64 %zmm16, %zmm5
-; AVX512BW-NEXT:    vpermt2q %zmm22, %zmm29, %zmm5
-; AVX512BW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm22, %zmm0, %zmm16
+; AVX512BW-NEXT:    vmovdqa64 %zmm16, %zmm14
+; AVX512BW-NEXT:    vpermt2q %zmm19, %zmm30, %zmm14
+; AVX512BW-NEXT:    vmovdqa64 %zmm16, %zmm11
+; AVX512BW-NEXT:    vpermt2q %zmm19, %zmm31, %zmm11
+; AVX512BW-NEXT:    vmovdqa64 %zmm16, %zmm25
+; AVX512BW-NEXT:    vpermt2q %zmm19, %zmm23, %zmm25
+; AVX512BW-NEXT:    vpermt2q %zmm19, %zmm26, %zmm16
 ; AVX512BW-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm19, %zmm5
-; AVX512BW-NEXT:    vpermt2q %zmm28, %zmm2, %zmm19
-; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm18
-; AVX512BW-NEXT:    vpermt2q %zmm28, %zmm1, %zmm18
-; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm22
-; AVX512BW-NEXT:    vpermt2q %zmm28, %zmm29, %zmm22
-; AVX512BW-NEXT:    vpermt2q %zmm28, %zmm0, %zmm5
-; AVX512BW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm30, %zmm15
-; AVX512BW-NEXT:    vpermt2q %zmm6, %zmm2, %zmm15
-; AVX512BW-NEXT:    vmovdqa64 %zmm30, %zmm16
-; AVX512BW-NEXT:    vpermt2q %zmm6, %zmm1, %zmm16
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm10 {%k1} = zmm30[0],zmm6[0],zmm30[2],zmm6[2],zmm30[4],zmm6[4],zmm30[6],zmm6[6]
-; AVX512BW-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm20 {%k1} = zmm30[1],zmm6[1],zmm30[3],zmm6[3],zmm30[5],zmm6[5],zmm30[7],zmm6[7]
-; AVX512BW-NEXT:    vmovdqa64 %zmm30, %zmm5
-; AVX512BW-NEXT:    vpermt2q %zmm6, %zmm29, %zmm5
-; AVX512BW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm6, %zmm0, %zmm30
-; AVX512BW-NEXT:    vmovdqa64 %zmm21, %zmm17
-; AVX512BW-NEXT:    vpermt2q %zmm9, %zmm2, %zmm17
-; AVX512BW-NEXT:    vmovdqa64 %zmm21, %zmm5
-; AVX512BW-NEXT:    vpermt2q %zmm9, %zmm1, %zmm5
-; AVX512BW-NEXT:    vmovdqa64 %zmm21, %zmm11
-; AVX512BW-NEXT:    vpermt2q %zmm9, %zmm29, %zmm11
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm7 = zmm21[0],zmm9[0],zmm21[2],zmm9[2],zmm21[4],zmm9[4],zmm21[6],zmm9[6]
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm8 = zmm21[1],zmm9[1],zmm21[3],zmm9[3],zmm21[5],zmm9[5],zmm21[7],zmm9[7]
-; AVX512BW-NEXT:    vpermt2q %zmm9, %zmm0, %zmm21
-; AVX512BW-NEXT:    vmovdqa64 %zmm14, %zmm10
-; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm2, %zmm10
-; AVX512BW-NEXT:    vmovdqa64 %zmm14, %zmm25
-; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm1, %zmm25
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm27 {%k1} = zmm14[0],zmm4[0],zmm14[2],zmm4[2],zmm14[4],zmm4[4],zmm14[6],zmm4[6]
-; AVX512BW-NEXT:    vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm9 {%k1} = zmm14[1],zmm4[1],zmm14[3],zmm4[3],zmm14[5],zmm4[5],zmm14[7],zmm4[7]
-; AVX512BW-NEXT:    vmovdqa64 %zmm14, %zmm27
-; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm29, %zmm27
-; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm0, %zmm14
-; AVX512BW-NEXT:    vpermi2q %zmm3, %zmm31, %zmm2
-; AVX512BW-NEXT:    vpermi2q %zmm3, %zmm31, %zmm1
-; AVX512BW-NEXT:    vpermi2q %zmm3, %zmm31, %zmm29
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm4 = zmm31[0],zmm3[0],zmm31[2],zmm3[2],zmm31[4],zmm3[4],zmm31[6],zmm3[6]
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm6 = zmm31[1],zmm3[1],zmm31[3],zmm3[3],zmm31[5],zmm3[5],zmm31[7],zmm3[7]
-; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm0, %zmm31
+; AVX512BW-NEXT:    vmovdqa64 128(%r10), %zmm19
+; AVX512BW-NEXT:    vmovdqa64 128(%rax), %zmm0
+; AVX512BW-NEXT:    vmovdqa64 %zmm19, %zmm28
+; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm30, %zmm28
+; AVX512BW-NEXT:    vmovdqa64 %zmm19, %zmm1
+; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm31, %zmm1
+; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm19, %zmm13
+; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm27, %zmm13
+; AVX512BW-NEXT:    vmovdqa64 %zmm19, %zmm16
+; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm24, %zmm16
+; AVX512BW-NEXT:    vmovdqa64 128(%r8), %zmm20
+; AVX512BW-NEXT:    vmovdqa64 128(%r9), %zmm1
+; AVX512BW-NEXT:    vmovdqa64 %zmm20, %zmm3
+; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm22, %zmm3
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm19[0],zmm0[0],zmm19[2],zmm0[2],zmm19[4],zmm0[4],zmm19[6],zmm0[6]
+; AVX512BW-NEXT:    vmovdqa64 %zmm20, %zmm10
+; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm21, %zmm10
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm10 {%k1} = zmm19[1],zmm0[1],zmm19[3],zmm0[3],zmm19[5],zmm0[5],zmm19[7],zmm0[7]
+; AVX512BW-NEXT:    vmovdqa64 %zmm19, %zmm2
+; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm23, %zmm2
+; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm26, %zmm19
+; AVX512BW-NEXT:    vmovdqa64 %zmm20, %zmm4
+; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm30, %zmm4
+; AVX512BW-NEXT:    vmovdqa64 %zmm20, %zmm6
+; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm31, %zmm6
+; AVX512BW-NEXT:    vmovdqa64 %zmm20, %zmm17
+; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm23, %zmm17
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm5 = zmm20[0],zmm1[0],zmm20[2],zmm1[2],zmm20[4],zmm1[4],zmm20[6],zmm1[6]
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm7 = zmm20[1],zmm1[1],zmm20[3],zmm1[3],zmm20[5],zmm1[5],zmm20[7],zmm1[7]
+; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm26, %zmm20
+; AVX512BW-NEXT:    vmovdqa64 192(%r10), %zmm8
+; AVX512BW-NEXT:    vmovdqa64 192(%rax), %zmm1
+; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm12
+; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm30, %zmm12
+; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm15
+; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm31, %zmm15
+; AVX512BW-NEXT:    vpermi2q %zmm1, %zmm8, %zmm27
+; AVX512BW-NEXT:    vpermi2q %zmm1, %zmm8, %zmm24
+; AVX512BW-NEXT:    vmovdqa64 192(%r8), %zmm2
+; AVX512BW-NEXT:    vmovdqa64 192(%r9), %zmm0
+; AVX512BW-NEXT:    vpermi2q %zmm0, %zmm2, %zmm22
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm22 {%k1} = zmm8[0],zmm1[0],zmm8[2],zmm1[2],zmm8[4],zmm1[4],zmm8[6],zmm1[6]
+; AVX512BW-NEXT:    vpermi2q %zmm0, %zmm2, %zmm21
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm21 {%k1} = zmm8[1],zmm1[1],zmm8[3],zmm1[3],zmm8[5],zmm1[5],zmm8[7],zmm1[7]
+; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm9
+; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm23, %zmm9
+; AVX512BW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm26, %zmm8
+; AVX512BW-NEXT:    vpermi2q %zmm0, %zmm2, %zmm30
+; AVX512BW-NEXT:    vpermi2q %zmm0, %zmm2, %zmm31
+; AVX512BW-NEXT:    vpermi2q %zmm0, %zmm2, %zmm23
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm1 = zmm2[0],zmm0[0],zmm2[2],zmm0[2],zmm2[4],zmm0[4],zmm2[6],zmm0[6]
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm9 = zmm2[1],zmm0[1],zmm2[3],zmm0[3],zmm2[5],zmm0[5],zmm2[7],zmm0[7]
+; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm26, %zmm2
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm13 {%k1}
-; AVX512BW-NEXT:    vmovdqa (%rsi), %xmm0
-; AVX512BW-NEXT:    vinserti128 $1, (%rcx), %ymm0, %ymm0
-; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm3
-; AVX512BW-NEXT:    vinserti128 $1, (%rdx), %ymm3, %ymm3
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm28 = ymm3[0],ymm0[0],ymm3[2],ymm0[2]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm28, %zmm13, %zmm13
-; AVX512BW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 (%rsp), %zmm13 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm12 {%k1}
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm3[1],ymm0[1],ymm3[3],ymm0[3]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm12, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm26, %zmm19 {%k1}
-; AVX512BW-NEXT:    vmovdqa 64(%rsi), %xmm0
-; AVX512BW-NEXT:    vinserti128 $1, 64(%rcx), %ymm0, %ymm0
-; AVX512BW-NEXT:    vmovdqa 64(%rdi), %xmm12
-; AVX512BW-NEXT:    vinserti128 $1, 64(%rdx), %ymm12, %ymm12
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm0[0],ymm12[2],ymm0[2]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm13, %zmm19, %zmm3
-; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm23, %zmm18 {%k1}
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm12[1],ymm0[1],ymm12[3],ymm0[3]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm18, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm15, %zmm17 {%k1}
-; AVX512BW-NEXT:    vmovdqa 128(%rsi), %xmm0
-; AVX512BW-NEXT:    vinserti128 $1, 128(%rcx), %ymm0, %ymm0
-; AVX512BW-NEXT:    vmovdqa 128(%rdi), %xmm12
-; AVX512BW-NEXT:    vinserti128 $1, 128(%rdx), %ymm12, %ymm13
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm12 = ymm13[0],ymm0[0],ymm13[2],ymm0[2]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm12, %zmm17, %zmm3
-; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm16, %zmm5 {%k1}
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm13[1],ymm0[1],ymm13[3],ymm0[3]
+; AVX512BW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX512BW-NEXT:    # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm5 {%k1}
 ; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm5, %zmm0
 ; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm7 {%k1}
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload
-; AVX512BW-NEXT:    # ymm5 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm5, %zmm7, %zmm28
+; AVX512BW-NEXT:    vpblendd $240, (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX512BW-NEXT:    # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqa64 %zmm16, %zmm7 {%k1}
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm7, %zmm0
+; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm8 {%k1}
+; AVX512BW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX512BW-NEXT:    # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm3, %zmm0
+; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload
-; AVX512BW-NEXT:    # ymm5 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm5, %zmm8, %zmm26
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload
-; AVX512BW-NEXT:    # ymm5 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vinsertf64x4 $0, %ymm5, %zmm0, %zmm19
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm29 {%k1}
+; AVX512BW-NEXT:    vmovdqa (%rsi), %xmm0
+; AVX512BW-NEXT:    vinserti128 $1, (%rcx), %ymm0, %ymm3
+; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512BW-NEXT:    vinserti128 $1, (%rdx), %ymm0, %ymm5
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm5[0],ymm3[0],ymm5[2],ymm3[2]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm29, %zmm0
+; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload
-; AVX512BW-NEXT:    # ymm5 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm5, %zmm20, %zmm18
-; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm2 {%k1}
-; AVX512BW-NEXT:    vmovdqa 192(%rsi), %xmm7
-; AVX512BW-NEXT:    vinserti128 $1, 192(%rcx), %ymm7, %ymm7
-; AVX512BW-NEXT:    vmovdqa 192(%rdi), %xmm10
-; AVX512BW-NEXT:    vinserti128 $1, 192(%rdx), %ymm10, %ymm10
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm15 = ymm10[0],ymm7[0],ymm10[2],ymm7[2]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm15, %zmm2, %zmm17
-; AVX512BW-NEXT:    vmovdqa64 %zmm25, %zmm1 {%k1}
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm7 = ymm10[1],ymm7[1],ymm10[3],ymm7[3]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm7, %zmm1, %zmm1
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm18 {%k1}
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm5[1],ymm3[1],ymm5[3],ymm3[3]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm3, %zmm18, %zmm0
+; AVX512BW-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm4 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm14 {%k1}
+; AVX512BW-NEXT:    vmovdqa 64(%rsi), %xmm3
+; AVX512BW-NEXT:    vinserti128 $1, 64(%rcx), %ymm3, %ymm3
+; AVX512BW-NEXT:    vmovdqa 64(%rdi), %xmm5
+; AVX512BW-NEXT:    vinserti128 $1, 64(%rdx), %ymm5, %ymm5
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm7 = ymm5[0],ymm3[0],ymm5[2],ymm3[2]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm7, %zmm14, %zmm29
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload
-; AVX512BW-NEXT:    # ymm7 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm7, %zmm4, %zmm7
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm11 {%k1}
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm5[1],ymm3[1],ymm5[3],ymm3[3]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm3, %zmm11, %zmm26
+; AVX512BW-NEXT:    vmovdqa64 %zmm28, %zmm4 {%k1}
+; AVX512BW-NEXT:    vmovdqa 128(%rsi), %xmm3
+; AVX512BW-NEXT:    vinserti128 $1, 128(%rcx), %ymm3, %ymm3
+; AVX512BW-NEXT:    vmovdqa 128(%rdi), %xmm5
+; AVX512BW-NEXT:    vinserti128 $1, 128(%rdx), %ymm5, %ymm5
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm7 = ymm5[0],ymm3[0],ymm5[2],ymm3[2]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm7, %zmm4, %zmm28
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm6 {%k1}
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm5[1],ymm3[1],ymm5[3],ymm3[3]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm3, %zmm6, %zmm18
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload
 ; AVX512BW-NEXT:    # ymm4 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm4, %zmm6, %zmm4
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload
-; AVX512BW-NEXT:    # ymm6 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vinsertf64x4 $0, %ymm6, %zmm0, %zmm10
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm4, %zmm10, %zmm16
+; AVX512BW-NEXT:    vmovdqa64 %zmm12, %zmm30 {%k1}
+; AVX512BW-NEXT:    vmovdqa 192(%rsi), %xmm4
+; AVX512BW-NEXT:    vinserti128 $1, 192(%rcx), %ymm4, %ymm5
+; AVX512BW-NEXT:    vmovdqa 192(%rdi), %xmm4
+; AVX512BW-NEXT:    vinserti128 $1, 192(%rdx), %ymm4, %ymm6
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm4 = ymm6[0],ymm5[0],ymm6[2],ymm5[2]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm4, %zmm30, %zmm30
+; AVX512BW-NEXT:    vmovdqa64 %zmm15, %zmm31 {%k1}
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm5 = ymm6[1],ymm5[1],ymm6[3],ymm5[3]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm5, %zmm31, %zmm31
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload
 ; AVX512BW-NEXT:    # ymm6 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm6, %zmm9, %zmm15
+; AVX512BW-NEXT:    vmovdqa64 %zmm27, %zmm1 {%k1}
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm6, %zmm1, %zmm6
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm24 {%k1}
-; AVX512BW-NEXT:    vmovdqa (%rcx), %ymm6
-; AVX512BW-NEXT:    vmovdqa64 (%rdx), %ymm16
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm16[0],ymm6[0],ymm16[2],ymm6[2]
-; AVX512BW-NEXT:    vmovdqa64 (%rsi), %ymm20
-; AVX512BW-NEXT:    vmovdqa64 (%rdi), %ymm23
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm12 = ymm23[0],ymm20[0],ymm23[2],ymm20[2]
-; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm12[2,3],ymm3[2,3]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm3, %zmm24, %zmm3
+; AVX512BW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
+; AVX512BW-NEXT:    # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqa64 %zmm24, %zmm9 {%k1}
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm9, %zmm7
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k1}
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm6 = ymm16[1],ymm6[1],ymm16[3],ymm6[3]
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm12 = ymm23[1],ymm20[1],ymm23[3],ymm20[3]
-; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm6 = ymm12[2,3],ymm6[2,3]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm6, %zmm2, %zmm6
+; AVX512BW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
+; AVX512BW-NEXT:    # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm22, %zmm9
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm22 {%k1}
-; AVX512BW-NEXT:    vmovdqa 64(%rcx), %ymm12
-; AVX512BW-NEXT:    vmovdqa64 64(%rdx), %ymm16
-; AVX512BW-NEXT:    vmovdqa64 64(%rsi), %ymm20
-; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %ymm23
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm16[0],ymm12[0],ymm16[2],ymm12[2]
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm13 = ymm23[0],ymm20[0],ymm23[2],ymm20[2]
-; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm22, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm5 {%k1}
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm12 = ymm16[1],ymm12[1],ymm16[3],ymm12[3]
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm13 = ymm23[1],ymm20[1],ymm23[3],ymm20[3]
-; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm12 = ymm13[2,3],ymm12[2,3]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm12, %zmm5, %zmm12
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm11 {%k1}
-; AVX512BW-NEXT:    vmovdqa 128(%rcx), %ymm13
-; AVX512BW-NEXT:    vmovdqa64 128(%rdx), %ymm16
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm5 = ymm16[0],ymm13[0],ymm16[2],ymm13[2]
-; AVX512BW-NEXT:    vmovdqa64 128(%rsi), %ymm20
+; AVX512BW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
+; AVX512BW-NEXT:    # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm21, %zmm12
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm3 {%k1}
+; AVX512BW-NEXT:    vmovdqa (%rcx), %ymm1
+; AVX512BW-NEXT:    vmovdqa (%rdx), %ymm15
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm15[0],ymm1[0],ymm15[2],ymm1[2]
+; AVX512BW-NEXT:    vmovdqa64 (%rsi), %ymm21
+; AVX512BW-NEXT:    vmovdqa64 (%rdi), %ymm22
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm14 = ymm22[0],ymm21[0],ymm22[2],ymm21[2]
+; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm14[2,3],ymm0[2,3]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm3, %zmm0
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm15[1],ymm1[1],ymm15[3],ymm1[3]
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm14 = ymm22[1],ymm21[1],ymm22[3],ymm21[3]
+; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm14[2,3],ymm1[2,3]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm25 {%k1}
+; AVX512BW-NEXT:    vmovdqa 64(%rcx), %ymm14
+; AVX512BW-NEXT:    vmovdqa 64(%rdx), %ymm15
+; AVX512BW-NEXT:    vmovdqa64 64(%rsi), %ymm21
+; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %ymm22
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm13 = ymm15[0],ymm14[0],ymm15[2],ymm14[2]
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm11 = ymm22[0],ymm21[0],ymm22[2],ymm21[2]
+; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm11 = ymm11[2,3],ymm13[2,3]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm11, %zmm25, %zmm11
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm13 = ymm15[1],ymm14[1],ymm15[3],ymm14[3]
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm14 = ymm22[1],ymm21[1],ymm22[3],ymm21[3]
+; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm13 = ymm14[2,3],ymm13[2,3]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm13, %zmm4, %zmm13
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm17 {%k1}
+; AVX512BW-NEXT:    vmovdqa 128(%rcx), %ymm14
+; AVX512BW-NEXT:    vmovdqa 128(%rdx), %ymm15
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm4 = ymm15[0],ymm14[0],ymm15[2],ymm14[2]
+; AVX512BW-NEXT:    vmovdqa64 128(%rsi), %ymm21
 ; AVX512BW-NEXT:    vmovdqa64 128(%rdi), %ymm22
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm8 = ymm22[0],ymm20[0],ymm22[2],ymm20[2]
-; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm5 = ymm8[2,3],ymm5[2,3]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm5, %zmm11, %zmm5
-; AVX512BW-NEXT:    vmovdqa64 %zmm30, %zmm21 {%k1}
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm8 = ymm16[1],ymm13[1],ymm16[3],ymm13[3]
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm11 = ymm22[1],ymm20[1],ymm22[3],ymm20[3]
-; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm8 = ymm11[2,3],ymm8[2,3]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm8, %zmm21, %zmm8
-; AVX512BW-NEXT:    vmovdqa64 %zmm27, %zmm29 {%k1}
-; AVX512BW-NEXT:    vmovdqa 192(%rcx), %ymm9
-; AVX512BW-NEXT:    vmovdqa 192(%rdx), %ymm11
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm13 = ymm11[0],ymm9[0],ymm11[2],ymm9[2]
-; AVX512BW-NEXT:    vmovdqa64 192(%rsi), %ymm16
-; AVX512BW-NEXT:    vmovdqa64 192(%rdi), %ymm20
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm2 = ymm20[0],ymm16[0],ymm20[2],ymm16[2]
-; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm2[2,3],ymm13[2,3]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm29, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 %zmm14, %zmm31 {%k1}
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm9 = ymm11[1],ymm9[1],ymm11[3],ymm9[3]
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm11 = ymm20[1],ymm16[1],ymm20[3],ymm16[3]
-; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm9 = ymm11[2,3],ymm9[2,3]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm9, %zmm31, %zmm9
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm22[0],ymm21[0],ymm22[2],ymm21[2]
+; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm4[2,3]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm3, %zmm17, %zmm3
+; AVX512BW-NEXT:    vmovdqa64 %zmm19, %zmm20 {%k1}
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm4 = ymm15[1],ymm14[1],ymm15[3],ymm14[3]
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm14 = ymm22[1],ymm21[1],ymm22[3],ymm21[3]
+; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm4 = ymm14[2,3],ymm4[2,3]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm4, %zmm20, %zmm4
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm23 {%k1}
+; AVX512BW-NEXT:    vmovdqa 192(%rcx), %ymm14
+; AVX512BW-NEXT:    vmovdqa 192(%rdx), %ymm15
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm5 = ymm15[0],ymm14[0],ymm15[2],ymm14[2]
+; AVX512BW-NEXT:    vmovdqa64 192(%rsi), %ymm17
+; AVX512BW-NEXT:    vmovdqa64 192(%rdi), %ymm19
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm10 = ymm19[0],ymm17[0],ymm19[2],ymm17[2]
+; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm5 = ymm10[2,3],ymm5[2,3]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm5, %zmm23, %zmm5
+; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm2 {%k1}
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm8 = ymm15[1],ymm14[1],ymm15[3],ymm14[3]
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm10 = ymm19[1],ymm17[1],ymm19[3],ymm17[3]
+; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm8 = ymm10[2,3],ymm8[2,3]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm8, %zmm2, %zmm2
 ; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-NEXT:    vmovdqa64 %zmm9, 1728(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, 1664(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm8, 1216(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm5, 1152(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm12, 704(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, 640(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm6, 192(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm3, 128(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm15, 1984(%rax)
-; AVX512BW-NEXT:    vmovaps %zmm10, 1920(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm4, 1856(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm7, 1792(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, 1600(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm17, 1536(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm18, 1472(%rax)
-; AVX512BW-NEXT:    vmovaps %zmm19, 1408(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm26, 1344(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm28, 1280(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, 1728(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm5, 1664(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm4, 1216(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, 1152(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm13, 704(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm11, 640(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, 192(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, 128(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm12, 1984(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm9, 1920(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm7, 1856(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm6, 1792(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm31, 1600(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm30, 1536(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm16, 1472(%rax)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm0, 1408(%rax)
 ; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 1088(%rax)
+; AVX512BW-NEXT:    vmovaps %zmm0, 1344(%rax)
 ; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 1024(%rax)
+; AVX512BW-NEXT:    vmovaps %zmm0, 1280(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm18, 1088(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm28, 1024(%rax)
 ; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-NEXT:    vmovaps %zmm0, 960(%rax)
 ; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
@@ -5090,10 +5040,8 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-NEXT:    vmovaps %zmm0, 832(%rax)
 ; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-NEXT:    vmovaps %zmm0, 768(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 576(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 512(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm26, 576(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm29, 512(%rax)
 ; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-NEXT:    vmovaps %zmm0, 448(%rax)
 ; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
@@ -5106,7 +5054,7 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-NEXT:    vmovaps %zmm0, 64(%rax)
 ; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-NEXT:    vmovaps %zmm0, (%rax)
-; AVX512BW-NEXT:    addq $2632, %rsp # imm = 0xA48
+; AVX512BW-NEXT:    addq $2312, %rsp # imm = 0x908
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
   %in.vec0 = load <32 x i64>, ptr %in.vecptr0, align 64
@@ -8878,1977 +8826,1895 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ;
 ; AVX512F-LABEL: store_i64_stride8_vf64:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    subq $5512, %rsp # imm = 0x1588
+; AVX512F-NEXT:    subq $5384, %rsp # imm = 0x1508
 ; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX512F-NEXT:    vmovdqa64 128(%rdi), %zmm2
+; AVX512F-NEXT:    vmovdqa64 128(%rdi), %zmm0
 ; AVX512F-NEXT:    vmovdqa64 64(%rdi), %zmm4
-; AVX512F-NEXT:    vmovdqa64 (%rdi), %zmm8
-; AVX512F-NEXT:    vmovdqa64 128(%rsi), %zmm17
-; AVX512F-NEXT:    vmovdqa64 64(%rsi), %zmm19
-; AVX512F-NEXT:    vmovdqa64 (%rsi), %zmm22
-; AVX512F-NEXT:    vmovdqa64 64(%rdx), %zmm5
-; AVX512F-NEXT:    vmovdqa64 (%rdx), %zmm10
-; AVX512F-NEXT:    vmovdqa64 64(%rcx), %zmm20
-; AVX512F-NEXT:    vmovdqa64 (%rcx), %zmm11
-; AVX512F-NEXT:    vmovdqa64 (%r8), %zmm1
-; AVX512F-NEXT:    vmovdqa64 64(%r8), %zmm25
-; AVX512F-NEXT:    vmovdqa64 128(%r8), %zmm23
-; AVX512F-NEXT:    vmovdqa64 (%r9), %zmm28
-; AVX512F-NEXT:    vmovdqa64 64(%r9), %zmm26
-; AVX512F-NEXT:    vmovdqa64 128(%r9), %zmm24
-; AVX512F-NEXT:    vmovdqa64 (%r10), %zmm21
-; AVX512F-NEXT:    vmovdqa64 64(%r10), %zmm14
-; AVX512F-NEXT:    vmovdqa64 (%rax), %zmm27
-; AVX512F-NEXT:    vmovdqa64 64(%rax), %zmm16
+; AVX512F-NEXT:    vmovdqa64 (%rdi), %zmm14
+; AVX512F-NEXT:    vmovdqa64 128(%rsi), %zmm1
+; AVX512F-NEXT:    vmovdqa64 64(%rsi), %zmm5
+; AVX512F-NEXT:    vmovdqa64 (%rsi), %zmm10
+; AVX512F-NEXT:    vmovdqa64 128(%rdx), %zmm2
+; AVX512F-NEXT:    vmovdqa64 64(%rdx), %zmm7
+; AVX512F-NEXT:    vmovdqa64 (%rdx), %zmm12
+; AVX512F-NEXT:    vmovdqa64 128(%rcx), %zmm3
+; AVX512F-NEXT:    vmovdqa64 64(%rcx), %zmm9
+; AVX512F-NEXT:    vmovdqa64 (%rcx), %zmm15
+; AVX512F-NEXT:    vmovdqa64 (%r8), %zmm30
+; AVX512F-NEXT:    vmovdqa64 64(%r8), %zmm18
+; AVX512F-NEXT:    vmovdqa64 128(%r8), %zmm11
+; AVX512F-NEXT:    vmovdqa64 (%r9), %zmm24
+; AVX512F-NEXT:    vmovdqa64 64(%r9), %zmm28
+; AVX512F-NEXT:    vmovdqa64 128(%r9), %zmm22
+; AVX512F-NEXT:    vmovdqa64 (%r10), %zmm26
+; AVX512F-NEXT:    vmovdqa64 64(%r10), %zmm31
+; AVX512F-NEXT:    vmovdqa64 128(%r10), %zmm16
+; AVX512F-NEXT:    vmovdqa64 (%rax), %zmm17
+; AVX512F-NEXT:    vmovdqa64 64(%rax), %zmm27
+; AVX512F-NEXT:    vmovdqa64 128(%rax), %zmm13
 ; AVX512F-NEXT:    movb $-64, %r11b
 ; AVX512F-NEXT:    kmovw %r11d, %k1
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [4,12,4,12,4,12,4,12]
-; AVX512F-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm21, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm27, %zmm3, %zmm0
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm12 = zmm1[0],zmm28[0],zmm1[2],zmm28[2],zmm1[4],zmm28[4],zmm1[6],zmm28[6]
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm12 {%k1}
-; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm22, %zmm3, %zmm0
-; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm6 = [4,12,4,12]
-; AVX512F-NEXT:    # ymm6 = mem[0,1,0,1]
-; AVX512F-NEXT:    vmovdqa64 %zmm10, %zmm15
-; AVX512F-NEXT:    vpermt2q %zmm11, %zmm6, %zmm15
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm12, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm9 = [5,13,5,13,5,13,5,13]
-; AVX512F-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm21, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm27, %zmm9, %zmm0
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm12 = zmm1[1],zmm28[1],zmm1[3],zmm28[3],zmm1[5],zmm28[5],zmm1[7],zmm28[7]
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm12 {%k1}
-; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm22, %zmm9, %zmm0
-; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm7 = [5,13,5,13]
-; AVX512F-NEXT:    # ymm7 = mem[0,1,0,1]
-; AVX512F-NEXT:    vmovdqa64 %zmm10, %zmm15
-; AVX512F-NEXT:    vpermt2q %zmm11, %zmm7, %zmm15
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm12, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,6,14,6,14,6,14]
-; AVX512F-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm0
-; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm29
-; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm28, %zmm13, %zmm0
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm21[0],zmm27[0],zmm21[2],zmm27[2],zmm21[4],zmm27[4],zmm21[6],zmm27[6]
-; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm12
-; AVX512F-NEXT:    vpermt2q %zmm22, %zmm13, %zmm12
-; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14]
-; AVX512F-NEXT:    # ymm1 = mem[0,1,0,1]
-; AVX512F-NEXT:    vmovdqa64 %zmm10, %zmm15
-; AVX512F-NEXT:    vpermt2q %zmm11, %zmm1, %zmm15
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm15[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm12, %zmm0, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm18 = [7,15,7,15,7,15,7,15]
-; AVX512F-NEXT:    # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm29, %zmm12
-; AVX512F-NEXT:    vpermt2q %zmm28, %zmm18, %zmm12
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm12 {%k1} = zmm21[1],zmm27[1],zmm21[3],zmm27[3],zmm21[5],zmm27[5],zmm21[7],zmm27[7]
-; AVX512F-NEXT:    vpermt2q %zmm22, %zmm18, %zmm8
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} ymm30 = [7,15,7,15]
-; AVX512F-NEXT:    # ymm30 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vpermt2q %zmm11, %zmm30, %zmm10
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm8, %zmm12, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm19 = [4,12,4,12,4,12,4,12]
+; AVX512F-NEXT:    # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vmovdqa64 %zmm12, %zmm8
+; AVX512F-NEXT:    vpermt2q %zmm15, %zmm19, %zmm8
+; AVX512F-NEXT:    vmovdqa64 %zmm14, %zmm6
+; AVX512F-NEXT:    vpermt2q %zmm10, %zmm19, %zmm6
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
+; AVX512F-NEXT:    vmovdqa64 %zmm26, %zmm8
+; AVX512F-NEXT:    vpermt2q %zmm17, %zmm19, %zmm8
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm20 = zmm30[0],zmm24[0],zmm30[2],zmm24[2],zmm30[4],zmm24[4],zmm30[6],zmm24[6]
+; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm20 {%k1}
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm6, %zmm20, %zmm6
+; AVX512F-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm21 = [5,13,5,13,5,13,5,13]
+; AVX512F-NEXT:    # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vmovdqa64 %zmm12, %zmm6
+; AVX512F-NEXT:    vpermt2q %zmm15, %zmm21, %zmm6
 ; AVX512F-NEXT:    vmovdqa64 %zmm14, %zmm8
-; AVX512F-NEXT:    vpermt2q %zmm16, %zmm3, %zmm8
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm10 = zmm25[0],zmm26[0],zmm25[2],zmm26[2],zmm25[4],zmm26[4],zmm25[6],zmm26[6]
-; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm10 {%k1}
-; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm8
-; AVX512F-NEXT:    vpermt2q %zmm19, %zmm3, %zmm8
-; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm11
-; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm15
-; AVX512F-NEXT:    vpermt2q %zmm20, %zmm6, %zmm11
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm11[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm8, %zmm10, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm10, %zmm21, %zmm8
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7]
+; AVX512F-NEXT:    vmovdqa64 %zmm26, %zmm8
+; AVX512F-NEXT:    vpermt2q %zmm17, %zmm21, %zmm8
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm20 = zmm30[1],zmm24[1],zmm30[3],zmm24[3],zmm30[5],zmm24[5],zmm30[7],zmm24[7]
+; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm20 {%k1}
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm6, %zmm20, %zmm6
+; AVX512F-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm23 = [6,14,6,14,6,14,6,14]
+; AVX512F-NEXT:    # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vmovdqa64 %zmm12, %zmm6
+; AVX512F-NEXT:    vpermt2q %zmm15, %zmm23, %zmm6
 ; AVX512F-NEXT:    vmovdqa64 %zmm14, %zmm8
-; AVX512F-NEXT:    vpermt2q %zmm16, %zmm9, %zmm8
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm10 = zmm25[1],zmm26[1],zmm25[3],zmm26[3],zmm25[5],zmm26[5],zmm25[7],zmm26[7]
-; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm10 {%k1}
+; AVX512F-NEXT:    vpermt2q %zmm10, %zmm23, %zmm8
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7]
+; AVX512F-NEXT:    vmovdqa64 %zmm30, %zmm8
+; AVX512F-NEXT:    vpermt2q %zmm24, %zmm23, %zmm8
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm26[0],zmm17[0],zmm26[2],zmm17[2],zmm26[4],zmm17[4],zmm26[6],zmm17[6]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm6, %zmm8, %zmm6
+; AVX512F-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm25 = [7,15,7,15,7,15,7,15]
+; AVX512F-NEXT:    # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vpermt2q %zmm15, %zmm25, %zmm12
+; AVX512F-NEXT:    vpermt2q %zmm10, %zmm25, %zmm14
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm6 = ymm14[0,1,2,3],ymm12[4,5,6,7]
+; AVX512F-NEXT:    vmovdqa64 %zmm30, %zmm8
+; AVX512F-NEXT:    vpermt2q %zmm24, %zmm25, %zmm8
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm8 {%k1} = zmm26[1],zmm17[1],zmm26[3],zmm17[3],zmm26[5],zmm17[5],zmm26[7],zmm17[7]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm6, %zmm8, %zmm6
+; AVX512F-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm7, %zmm6
+; AVX512F-NEXT:    vpermt2q %zmm9, %zmm19, %zmm6
 ; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm8
-; AVX512F-NEXT:    vpermt2q %zmm19, %zmm9, %zmm8
-; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm11
-; AVX512F-NEXT:    vmovdqa64 %zmm7, %zmm12
-; AVX512F-NEXT:    vpermt2q %zmm20, %zmm7, %zmm11
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm11[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm8, %zmm10, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm5, %zmm19, %zmm8
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7]
+; AVX512F-NEXT:    vmovdqa64 %zmm31, %zmm8
+; AVX512F-NEXT:    vpermt2q %zmm27, %zmm19, %zmm8
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm10 = zmm18[0],zmm28[0],zmm18[2],zmm28[2],zmm18[4],zmm28[4],zmm18[6],zmm28[6]
+; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm10 {%k1}
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm6, %zmm10, %zmm6
+; AVX512F-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm7, %zmm6
+; AVX512F-NEXT:    vpermt2q %zmm9, %zmm21, %zmm6
 ; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm8
-; AVX512F-NEXT:    vpermt2q %zmm19, %zmm13, %zmm8
-; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm10
-; AVX512F-NEXT:    vpermt2q %zmm20, %zmm1, %zmm10
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7]
-; AVX512F-NEXT:    vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm25, %zmm10
-; AVX512F-NEXT:    vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm26, %zmm13, %zmm10
-; AVX512F-NEXT:    vmovdqa64 %zmm14, %zmm11
-; AVX512F-NEXT:    vmovdqu64 %zmm14, (%rsp) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm10 {%k1} = zmm14[0],zmm16[0],zmm14[2],zmm16[2],zmm14[4],zmm16[4],zmm14[6],zmm16[6]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm8, %zmm10, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 128(%r10), %zmm10
-; AVX512F-NEXT:    vpermt2q %zmm19, %zmm18, %zmm4
-; AVX512F-NEXT:    vmovdqa64 128(%rax), %zmm14
-; AVX512F-NEXT:    vpermt2q %zmm20, %zmm30, %zmm5
-; AVX512F-NEXT:    vmovdqa64 %zmm25, %zmm6
-; AVX512F-NEXT:    vpermt2q %zmm26, %zmm18, %zmm6
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm6 {%k1} = zmm11[1],zmm16[1],zmm11[3],zmm16[3],zmm11[5],zmm16[5],zmm11[7],zmm16[7]
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm4, %zmm6, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm10, %zmm4
-; AVX512F-NEXT:    vpermt2q %zmm14, %zmm3, %zmm4
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm5 = zmm23[0],zmm24[0],zmm23[2],zmm24[2],zmm23[4],zmm24[4],zmm23[6],zmm24[6]
-; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm5 {%k1}
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm6
-; AVX512F-NEXT:    vpermt2q %zmm17, %zmm3, %zmm6
-; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm11
-; AVX512F-NEXT:    vmovdqa64 128(%rdx), %zmm4
-; AVX512F-NEXT:    vmovdqa64 128(%rcx), %zmm7
+; AVX512F-NEXT:    vpermt2q %zmm5, %zmm21, %zmm8
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7]
+; AVX512F-NEXT:    vmovdqa64 %zmm31, %zmm8
+; AVX512F-NEXT:    vpermt2q %zmm27, %zmm21, %zmm8
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm10 = zmm18[1],zmm28[1],zmm18[3],zmm28[3],zmm18[5],zmm28[5],zmm18[7],zmm28[7]
+; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm10 {%k1}
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm6, %zmm10, %zmm6
+; AVX512F-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm7, %zmm6
+; AVX512F-NEXT:    vpermt2q %zmm9, %zmm23, %zmm6
 ; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm8
-; AVX512F-NEXT:    vpermt2q %zmm7, %zmm15, %zmm8
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm6, %zmm5, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm10, %zmm5
-; AVX512F-NEXT:    vpermt2q %zmm14, %zmm9, %zmm5
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm6 = zmm23[1],zmm24[1],zmm23[3],zmm24[3],zmm23[5],zmm24[5],zmm23[7],zmm24[7]
+; AVX512F-NEXT:    vpermt2q %zmm5, %zmm23, %zmm8
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7]
+; AVX512F-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm18, %zmm8
+; AVX512F-NEXT:    vpermt2q %zmm28, %zmm23, %zmm8
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm31[0],zmm27[0],zmm31[2],zmm27[2],zmm31[4],zmm27[4],zmm31[6],zmm27[6]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm6, %zmm8, %zmm6
+; AVX512F-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm9, %zmm25, %zmm7
+; AVX512F-NEXT:    vpermt2q %zmm5, %zmm25, %zmm4
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5,6,7]
+; AVX512F-NEXT:    vmovdqa64 %zmm18, %zmm5
+; AVX512F-NEXT:    vpermt2q %zmm28, %zmm25, %zmm5
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm31[1],zmm27[1],zmm31[3],zmm27[3],zmm31[5],zmm27[5],zmm31[7],zmm27[7]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm4, %zmm5, %zmm4
+; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm4
+; AVX512F-NEXT:    vpermt2q %zmm3, %zmm19, %zmm4
+; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm5
+; AVX512F-NEXT:    vpermt2q %zmm1, %zmm19, %zmm5
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
+; AVX512F-NEXT:    vmovdqa64 %zmm16, %zmm5
+; AVX512F-NEXT:    vpermt2q %zmm13, %zmm19, %zmm5
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm6 = zmm11[0],zmm22[0],zmm11[2],zmm22[2],zmm11[4],zmm22[4],zmm11[6],zmm22[6]
 ; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm6 {%k1}
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm5
-; AVX512F-NEXT:    vpermt2q %zmm17, %zmm9, %zmm5
-; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm8
-; AVX512F-NEXT:    vpermt2q %zmm7, %zmm12, %zmm8
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm5, %zmm6, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm5
-; AVX512F-NEXT:    vpermt2q %zmm17, %zmm13, %zmm5
-; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm6
-; AVX512F-NEXT:    vpermt2q %zmm7, %zmm1, %zmm6
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7]
-; AVX512F-NEXT:    vmovdqa64 %zmm23, %zmm3
-; AVX512F-NEXT:    vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm23, %zmm6
-; AVX512F-NEXT:    vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm24, %zmm13, %zmm6
-; AVX512F-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm10[0],zmm14[0],zmm10[2],zmm14[2],zmm10[4],zmm14[4],zmm10[6],zmm14[6]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm5, %zmm6, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm17, %zmm18, %zmm2
-; AVX512F-NEXT:    vpermt2q %zmm7, %zmm30, %zmm4
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
-; AVX512F-NEXT:    vpermt2q %zmm24, %zmm18, %zmm3
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm10[1],zmm14[1],zmm10[3],zmm14[3],zmm10[5],zmm14[5],zmm10[7],zmm14[7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 192(%r10), %zmm10
-; AVX512F-NEXT:    vmovdqa64 192(%rax), %zmm26
-; AVX512F-NEXT:    vmovdqa64 %zmm10, %zmm2
-; AVX512F-NEXT:    vpermt2q %zmm26, %zmm11, %zmm2
-; AVX512F-NEXT:    vmovdqa64 192(%r8), %zmm23
-; AVX512F-NEXT:    vmovdqa64 192(%r9), %zmm29
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm4 = zmm23[0],zmm29[0],zmm23[2],zmm29[2],zmm23[4],zmm29[4],zmm23[6],zmm29[6]
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm4 {%k1}
-; AVX512F-NEXT:    vmovdqa64 192(%rdi), %zmm2
-; AVX512F-NEXT:    vmovdqa64 192(%rsi), %zmm5
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm6
-; AVX512F-NEXT:    vpermt2q %zmm5, %zmm11, %zmm6
-; AVX512F-NEXT:    vmovdqa64 192(%rdx), %zmm3
-; AVX512F-NEXT:    vmovdqa64 192(%rcx), %zmm7
-; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm8
-; AVX512F-NEXT:    vpermt2q %zmm7, %zmm15, %zmm8
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm6, %zmm4, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm10, %zmm4
-; AVX512F-NEXT:    vpermt2q %zmm26, %zmm9, %zmm4
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm6 = zmm23[1],zmm29[1],zmm23[3],zmm29[3],zmm23[5],zmm29[5],zmm23[7],zmm29[7]
-; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm6 {%k1}
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm4, %zmm6, %zmm4
+; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm4
-; AVX512F-NEXT:    vpermt2q %zmm5, %zmm9, %zmm4
-; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm8
-; AVX512F-NEXT:    vpermt2q %zmm7, %zmm12, %zmm8
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm4, %zmm6, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm3, %zmm21, %zmm4
+; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm5
+; AVX512F-NEXT:    vpermt2q %zmm1, %zmm21, %zmm5
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
+; AVX512F-NEXT:    vmovdqa64 %zmm16, %zmm5
+; AVX512F-NEXT:    vpermt2q %zmm13, %zmm21, %zmm5
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm6 = zmm11[1],zmm22[1],zmm11[3],zmm22[3],zmm11[5],zmm22[5],zmm11[7],zmm22[7]
+; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm6 {%k1}
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm4, %zmm6, %zmm4
+; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm4
-; AVX512F-NEXT:    vpermt2q %zmm5, %zmm13, %zmm4
-; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm6
-; AVX512F-NEXT:    vpermt2q %zmm7, %zmm1, %zmm6
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7]
-; AVX512F-NEXT:    vmovdqa64 %zmm23, %zmm6
-; AVX512F-NEXT:    vpermt2q %zmm29, %zmm13, %zmm6
-; AVX512F-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm10[0],zmm26[0],zmm10[2],zmm26[2],zmm10[4],zmm26[4],zmm10[6],zmm26[6]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm4, %zmm6, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm5, %zmm18, %zmm2
-; AVX512F-NEXT:    vpermt2q %zmm7, %zmm30, %zmm3
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512F-NEXT:    vmovdqa64 %zmm23, %zmm3
-; AVX512F-NEXT:    vpermt2q %zmm29, %zmm18, %zmm3
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm10[1],zmm26[1],zmm10[3],zmm26[3],zmm10[5],zmm26[5],zmm10[7],zmm26[7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm0
+; AVX512F-NEXT:    vpermt2q %zmm3, %zmm23, %zmm4
+; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm5
+; AVX512F-NEXT:    vpermt2q %zmm1, %zmm23, %zmm5
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
+; AVX512F-NEXT:    vmovdqa64 %zmm11, %zmm5
+; AVX512F-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm22, %zmm23, %zmm5
+; AVX512F-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm16[0],zmm13[0],zmm16[2],zmm13[2],zmm16[4],zmm13[4],zmm16[6],zmm13[6]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm4, %zmm5, %zmm4
+; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 192(%rdi), %zmm4
+; AVX512F-NEXT:    vpermt2q %zmm3, %zmm25, %zmm2
+; AVX512F-NEXT:    vmovdqa64 192(%rdx), %zmm3
+; AVX512F-NEXT:    vpermt2q %zmm1, %zmm25, %zmm0
+; AVX512F-NEXT:    vmovdqa64 192(%rcx), %zmm1
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
+; AVX512F-NEXT:    vmovdqa64 %zmm11, %zmm2
+; AVX512F-NEXT:    vpermt2q %zmm22, %zmm25, %zmm2
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm16[1],zmm13[1],zmm16[3],zmm13[3],zmm16[5],zmm13[5],zmm16[7],zmm13[7]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm0
 ; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 256(%r10), %zmm10
-; AVX512F-NEXT:    vmovdqa64 256(%rax), %zmm22
-; AVX512F-NEXT:    vmovdqa64 %zmm10, %zmm2
-; AVX512F-NEXT:    vpermt2q %zmm22, %zmm11, %zmm2
-; AVX512F-NEXT:    vmovdqa64 256(%r8), %zmm0
-; AVX512F-NEXT:    vmovdqa64 256(%r9), %zmm24
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm4 = zmm0[0],zmm24[0],zmm0[2],zmm24[2],zmm0[4],zmm24[4],zmm0[6],zmm24[6]
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm17
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm4 {%k1}
-; AVX512F-NEXT:    vmovdqa64 256(%rdi), %zmm2
+; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm0
+; AVX512F-NEXT:    vpermt2q %zmm1, %zmm19, %zmm0
+; AVX512F-NEXT:    vmovdqa64 192(%rsi), %zmm2
+; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm5
+; AVX512F-NEXT:    vpermt2q %zmm2, %zmm19, %zmm5
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7]
+; AVX512F-NEXT:    vmovdqa64 192(%r10), %zmm9
+; AVX512F-NEXT:    vmovdqa64 192(%rax), %zmm20
+; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm6
+; AVX512F-NEXT:    vpermt2q %zmm20, %zmm19, %zmm6
+; AVX512F-NEXT:    vmovdqa64 192(%r8), %zmm8
+; AVX512F-NEXT:    vmovdqa64 192(%r9), %zmm0
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm7 = zmm8[0],zmm0[0],zmm8[2],zmm0[2],zmm8[4],zmm0[4],zmm8[6],zmm0[6]
+; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm7 {%k1}
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm5, %zmm7, %zmm5
+; AVX512F-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm5
+; AVX512F-NEXT:    vpermt2q %zmm1, %zmm21, %zmm5
+; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm6
+; AVX512F-NEXT:    vpermt2q %zmm2, %zmm21, %zmm6
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
+; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm6
+; AVX512F-NEXT:    vpermt2q %zmm20, %zmm21, %zmm6
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm7 = zmm8[1],zmm0[1],zmm8[3],zmm0[3],zmm8[5],zmm0[5],zmm8[7],zmm0[7]
+; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm7 {%k1}
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm5, %zmm7, %zmm5
+; AVX512F-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm5
+; AVX512F-NEXT:    vpermt2q %zmm1, %zmm23, %zmm5
+; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm6
+; AVX512F-NEXT:    vpermt2q %zmm2, %zmm23, %zmm6
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
+; AVX512F-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm6
+; AVX512F-NEXT:    vpermt2q %zmm0, %zmm23, %zmm6
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm20[0],zmm9[2],zmm20[2],zmm9[4],zmm20[4],zmm9[6],zmm20[6]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm5, %zmm6, %zmm5
+; AVX512F-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm1, %zmm25, %zmm3
+; AVX512F-NEXT:    vpermt2q %zmm2, %zmm25, %zmm4
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm2
+; AVX512F-NEXT:    vpermt2q %zmm0, %zmm25, %zmm2
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm20[1],zmm9[3],zmm20[3],zmm9[5],zmm20[5],zmm9[7],zmm20[7]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm2, %zmm1
+; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 256(%rdx), %zmm1
+; AVX512F-NEXT:    vmovdqa64 256(%rcx), %zmm3
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm2
+; AVX512F-NEXT:    vpermt2q %zmm3, %zmm19, %zmm2
+; AVX512F-NEXT:    vmovdqa64 256(%rdi), %zmm4
 ; AVX512F-NEXT:    vmovdqa64 256(%rsi), %zmm5
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm6
-; AVX512F-NEXT:    vpermt2q %zmm5, %zmm11, %zmm6
-; AVX512F-NEXT:    vmovdqa64 256(%rdx), %zmm3
-; AVX512F-NEXT:    vmovdqa64 256(%rcx), %zmm7
-; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm8
-; AVX512F-NEXT:    vpermt2q %zmm7, %zmm15, %zmm8
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm6, %zmm4, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm10, %zmm4
-; AVX512F-NEXT:    vpermt2q %zmm22, %zmm9, %zmm4
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm6 = zmm17[1],zmm24[1],zmm17[3],zmm24[3],zmm17[5],zmm24[5],zmm17[7],zmm24[7]
-; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm6 {%k1}
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm4
-; AVX512F-NEXT:    vpermt2q %zmm5, %zmm9, %zmm4
-; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm8
-; AVX512F-NEXT:    vpermt2q %zmm7, %zmm12, %zmm8
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm4, %zmm6, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm4
-; AVX512F-NEXT:    vpermt2q %zmm5, %zmm13, %zmm4
-; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm6
-; AVX512F-NEXT:    vpermt2q %zmm7, %zmm1, %zmm6
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7]
-; AVX512F-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm17, %zmm6
-; AVX512F-NEXT:    vpermt2q %zmm24, %zmm13, %zmm6
-; AVX512F-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm10[0],zmm22[0],zmm10[2],zmm22[2],zmm10[4],zmm22[4],zmm10[6],zmm22[6]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm4, %zmm6, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm5, %zmm18, %zmm2
-; AVX512F-NEXT:    vpermt2q %zmm7, %zmm30, %zmm3
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512F-NEXT:    vmovdqa64 %zmm17, %zmm3
-; AVX512F-NEXT:    vpermt2q %zmm24, %zmm18, %zmm3
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm10[1],zmm22[1],zmm10[3],zmm22[3],zmm10[5],zmm22[5],zmm10[7],zmm22[7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 320(%rdi), %zmm2
-; AVX512F-NEXT:    vmovdqa64 320(%rsi), %zmm3
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm5
-; AVX512F-NEXT:    vpermt2q %zmm3, %zmm11, %zmm5
-; AVX512F-NEXT:    vmovdqa64 320(%rdx), %zmm14
-; AVX512F-NEXT:    vmovdqa64 320(%rcx), %zmm6
-; AVX512F-NEXT:    vmovdqa64 %zmm14, %zmm7
-; AVX512F-NEXT:    vpermt2q %zmm6, %zmm15, %zmm7
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7]
-; AVX512F-NEXT:    vmovdqa64 %zmm14, %zmm7
-; AVX512F-NEXT:    vpermt2q %zmm6, %zmm12, %zmm7
-; AVX512F-NEXT:    vmovdqa64 %zmm14, %zmm8
-; AVX512F-NEXT:    vpermt2q %zmm6, %zmm1, %zmm8
-; AVX512F-NEXT:    vpermt2q %zmm6, %zmm30, %zmm14
-; AVX512F-NEXT:    vmovdqa64 384(%rdx), %zmm0
-; AVX512F-NEXT:    vmovdqa64 384(%rcx), %zmm6
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm10
-; AVX512F-NEXT:    vpermt2q %zmm6, %zmm15, %zmm10
-; AVX512F-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm10
-; AVX512F-NEXT:    vpermt2q %zmm6, %zmm12, %zmm10
-; AVX512F-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm10
-; AVX512F-NEXT:    vpermt2q %zmm6, %zmm1, %zmm10
-; AVX512F-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm6, %zmm30, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 448(%rdx), %zmm0
-; AVX512F-NEXT:    vmovdqa64 448(%rcx), %zmm6
-; AVX512F-NEXT:    vpermi2q %zmm6, %zmm0, %zmm15
-; AVX512F-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermi2q %zmm6, %zmm0, %zmm12
-; AVX512F-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermi2q %zmm6, %zmm0, %zmm1
+; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm6
+; AVX512F-NEXT:    vpermt2q %zmm5, %zmm19, %zmm6
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm2[4,5,6,7]
+; AVX512F-NEXT:    vmovdqa64 256(%r10), %zmm13
+; AVX512F-NEXT:    vmovdqa64 256(%rax), %zmm2
+; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm7
+; AVX512F-NEXT:    vpermt2q %zmm2, %zmm19, %zmm7
+; AVX512F-NEXT:    vmovdqa64 256(%r8), %zmm18
+; AVX512F-NEXT:    vmovdqa64 256(%r9), %zmm15
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm8 = zmm18[0],zmm15[0],zmm18[2],zmm15[2],zmm18[4],zmm15[4],zmm18[6],zmm15[6]
+; AVX512F-NEXT:    vmovdqa64 %zmm7, %zmm8 {%k1}
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm6, %zmm8, %zmm6
+; AVX512F-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm6
+; AVX512F-NEXT:    vpermt2q %zmm3, %zmm21, %zmm6
+; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm7
+; AVX512F-NEXT:    vpermt2q %zmm5, %zmm21, %zmm7
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
+; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm7
+; AVX512F-NEXT:    vpermt2q %zmm2, %zmm21, %zmm7
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm8 = zmm18[1],zmm15[1],zmm18[3],zmm15[3],zmm18[5],zmm15[5],zmm18[7],zmm15[7]
+; AVX512F-NEXT:    vmovdqa64 %zmm7, %zmm8 {%k1}
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm6, %zmm8, %zmm6
+; AVX512F-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm6
+; AVX512F-NEXT:    vpermt2q %zmm3, %zmm23, %zmm6
+; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm7
+; AVX512F-NEXT:    vpermt2q %zmm5, %zmm23, %zmm7
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
+; AVX512F-NEXT:    vmovdqa64 %zmm18, %zmm7
+; AVX512F-NEXT:    vpermt2q %zmm15, %zmm23, %zmm7
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm13[0],zmm2[0],zmm13[2],zmm2[2],zmm13[4],zmm2[4],zmm13[6],zmm2[6]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm6, %zmm7, %zmm6
+; AVX512F-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm3, %zmm25, %zmm1
+; AVX512F-NEXT:    vpermt2q %zmm5, %zmm25, %zmm4
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT:    vmovdqa64 %zmm18, %zmm3
+; AVX512F-NEXT:    vpermt2q %zmm15, %zmm25, %zmm3
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm13[1],zmm2[1],zmm13[3],zmm2[3],zmm13[5],zmm2[5],zmm13[7],zmm2[7]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm3, %zmm1
 ; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm6, %zmm30, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 320(%r10), %zmm31
-; AVX512F-NEXT:    vmovdqa64 320(%rax), %zmm12
-; AVX512F-NEXT:    vmovdqa64 %zmm31, %zmm1
-; AVX512F-NEXT:    vpermt2q %zmm12, %zmm11, %zmm1
-; AVX512F-NEXT:    vmovdqa64 320(%r8), %zmm17
-; AVX512F-NEXT:    vmovdqa64 320(%r9), %zmm4
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm10 = zmm17[0],zmm4[0],zmm17[2],zmm4[2],zmm17[4],zmm4[4],zmm17[6],zmm4[6]
-; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm10 {%k1}
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm5, %zmm10, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm31, %zmm1
-; AVX512F-NEXT:    vpermt2q %zmm12, %zmm9, %zmm1
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm5 = zmm17[1],zmm4[1],zmm17[3],zmm4[3],zmm17[5],zmm4[5],zmm17[7],zmm4[7]
-; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm5 {%k1}
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm1
-; AVX512F-NEXT:    vpermt2q %zmm3, %zmm9, %zmm1
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm5, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm1
-; AVX512F-NEXT:    vpermt2q %zmm3, %zmm13, %zmm1
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7]
-; AVX512F-NEXT:    vmovdqa64 %zmm17, %zmm5
-; AVX512F-NEXT:    vpermt2q %zmm4, %zmm13, %zmm5
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm31[0],zmm12[0],zmm31[2],zmm12[2],zmm31[4],zmm12[4],zmm31[6],zmm12[6]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm5, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm3, %zmm18, %zmm2
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm14[4,5,6,7]
-; AVX512F-NEXT:    vmovdqa64 %zmm17, %zmm2
-; AVX512F-NEXT:    vpermt2q %zmm4, %zmm18, %zmm2
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm31[1],zmm12[1],zmm31[3],zmm12[3],zmm31[5],zmm12[5],zmm31[7],zmm12[7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm2, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 384(%rdi), %zmm0
-; AVX512F-NEXT:    vmovdqa64 384(%rsi), %zmm1
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm3
-; AVX512F-NEXT:    vmovdqa64 %zmm11, %zmm2
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm11, %zmm3
+; AVX512F-NEXT:    vmovdqa64 320(%rdx), %zmm1
+; AVX512F-NEXT:    vmovdqa64 320(%rcx), %zmm4
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm3
+; AVX512F-NEXT:    vpermt2q %zmm4, %zmm19, %zmm3
+; AVX512F-NEXT:    vmovdqa64 320(%rdi), %zmm6
+; AVX512F-NEXT:    vmovdqa64 320(%rsi), %zmm7
+; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm5
+; AVX512F-NEXT:    vpermt2q %zmm7, %zmm19, %zmm5
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm8 = ymm5[0,1,2,3],ymm3[4,5,6,7]
+; AVX512F-NEXT:    vmovdqa64 320(%r10), %zmm16
+; AVX512F-NEXT:    vmovdqa64 320(%rax), %zmm22
+; AVX512F-NEXT:    vmovdqa64 %zmm16, %zmm12
+; AVX512F-NEXT:    vpermt2q %zmm22, %zmm19, %zmm12
+; AVX512F-NEXT:    vmovdqa64 320(%r8), %zmm29
+; AVX512F-NEXT:    vmovdqa64 320(%r9), %zmm5
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm14 = zmm29[0],zmm5[0],zmm29[2],zmm5[2],zmm29[4],zmm5[4],zmm29[6],zmm5[6]
+; AVX512F-NEXT:    vmovdqa64 %zmm12, %zmm14 {%k1}
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm8, %zmm14, %zmm3
 ; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm3
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm9, %zmm3
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm8
+; AVX512F-NEXT:    vpermt2q %zmm4, %zmm21, %zmm8
+; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm12
+; AVX512F-NEXT:    vpermt2q %zmm7, %zmm21, %zmm12
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3],ymm8[4,5,6,7]
+; AVX512F-NEXT:    vmovdqa64 %zmm16, %zmm12
+; AVX512F-NEXT:    vpermt2q %zmm22, %zmm21, %zmm12
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm14 = zmm29[1],zmm5[1],zmm29[3],zmm5[3],zmm29[5],zmm5[5],zmm29[7],zmm5[7]
+; AVX512F-NEXT:    vmovdqa64 %zmm12, %zmm14 {%k1}
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm8, %zmm14, %zmm3
 ; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm3
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm13, %zmm3
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm8
+; AVX512F-NEXT:    vpermt2q %zmm4, %zmm23, %zmm8
+; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm12
+; AVX512F-NEXT:    vpermt2q %zmm7, %zmm23, %zmm12
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3],ymm8[4,5,6,7]
+; AVX512F-NEXT:    vmovdqa64 %zmm29, %zmm12
+; AVX512F-NEXT:    vpermt2q %zmm5, %zmm23, %zmm12
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm12 {%k1} = zmm16[0],zmm22[0],zmm16[2],zmm22[2],zmm16[4],zmm22[4],zmm16[6],zmm22[6]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm8, %zmm12, %zmm3
 ; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm18, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 384(%r10), %zmm11
-; AVX512F-NEXT:    vmovdqa64 384(%rax), %zmm20
-; AVX512F-NEXT:    vmovdqa64 %zmm11, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm20, %zmm2, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm11, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm20, %zmm9, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 384(%r8), %zmm30
-; AVX512F-NEXT:    vmovdqa64 384(%r9), %zmm0
-; AVX512F-NEXT:    vmovdqa64 %zmm30, %zmm1
-; AVX512F-NEXT:    vpermt2q %zmm0, %zmm13, %zmm1
-; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 448(%r10), %zmm16
-; AVX512F-NEXT:    vmovdqa64 448(%rax), %zmm8
-; AVX512F-NEXT:    vmovdqa64 %zmm16, %zmm1
-; AVX512F-NEXT:    vpermt2q %zmm8, %zmm2, %zmm1
+; AVX512F-NEXT:    vpermt2q %zmm4, %zmm25, %zmm1
+; AVX512F-NEXT:    vpermt2q %zmm7, %zmm25, %zmm6
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT:    vmovdqa64 %zmm29, %zmm4
+; AVX512F-NEXT:    vpermt2q %zmm5, %zmm25, %zmm4
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm4 {%k1} = zmm16[1],zmm22[1],zmm16[3],zmm22[3],zmm16[5],zmm22[5],zmm16[7],zmm22[7]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
 ; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 384(%rdx), %zmm3
+; AVX512F-NEXT:    vmovdqa64 384(%rcx), %zmm1
+; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm4
+; AVX512F-NEXT:    vpermt2q %zmm1, %zmm19, %zmm4
+; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm4
+; AVX512F-NEXT:    vpermt2q %zmm1, %zmm21, %zmm4
+; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm4
+; AVX512F-NEXT:    vpermt2q %zmm1, %zmm23, %zmm4
+; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm1, %zmm25, %zmm3
+; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 384(%rdi), %zmm3
+; AVX512F-NEXT:    vmovdqa64 384(%rsi), %zmm1
+; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm4
+; AVX512F-NEXT:    vpermt2q %zmm1, %zmm19, %zmm4
+; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm4
+; AVX512F-NEXT:    vpermt2q %zmm1, %zmm21, %zmm4
+; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm4
+; AVX512F-NEXT:    vpermt2q %zmm1, %zmm23, %zmm4
+; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm1, %zmm25, %zmm3
+; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 448(%rdx), %zmm3
+; AVX512F-NEXT:    vmovdqa64 448(%rcx), %zmm1
+; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm4
+; AVX512F-NEXT:    vpermt2q %zmm1, %zmm19, %zmm4
+; AVX512F-NEXT:    vmovdqu64 %zmm4, (%rsp) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm4
+; AVX512F-NEXT:    vpermt2q %zmm1, %zmm21, %zmm4
+; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm4
+; AVX512F-NEXT:    vpermt2q %zmm1, %zmm23, %zmm4
+; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm1, %zmm25, %zmm3
+; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-NEXT:    vmovdqa64 448(%rdi), %zmm3
 ; AVX512F-NEXT:    vmovdqa64 448(%rsi), %zmm1
-; AVX512F-NEXT:    vpermi2q %zmm1, %zmm3, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm16, %zmm2
-; AVX512F-NEXT:    vpermt2q %zmm8, %zmm9, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermi2q %zmm1, %zmm3, %zmm9
-; AVX512F-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 448(%r8), %zmm19
-; AVX512F-NEXT:    vmovdqa64 448(%r9), %zmm7
-; AVX512F-NEXT:    vmovdqa64 %zmm19, %zmm2
-; AVX512F-NEXT:    vpermt2q %zmm7, %zmm13, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermi2q %zmm1, %zmm3, %zmm13
-; AVX512F-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm18, %zmm3
+; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm4
+; AVX512F-NEXT:    vpermt2q %zmm1, %zmm19, %zmm4
+; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm4
+; AVX512F-NEXT:    vpermt2q %zmm1, %zmm21, %zmm4
+; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm4
+; AVX512F-NEXT:    vpermt2q %zmm1, %zmm23, %zmm4
+; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm1, %zmm25, %zmm3
 ; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm11 = [0,8,0,8,0,8,0,8]
+; AVX512F-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vmovdqa64 %zmm26, %zmm1
+; AVX512F-NEXT:    vpermt2q %zmm17, %zmm11, %zmm1
+; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm12 = [1,9,1,9,1,9,1,9]
+; AVX512F-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vmovdqa64 %zmm26, %zmm1
+; AVX512F-NEXT:    vpermt2q %zmm17, %zmm12, %zmm1
+; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm10 = [2,10,2,10,2,10,2,10]
+; AVX512F-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vmovdqa64 %zmm26, %zmm1
+; AVX512F-NEXT:    vpermt2q %zmm17, %zmm10, %zmm1
+; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm4 = [3,11,3,11,3,11,3,11]
+; AVX512F-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vpermt2q %zmm17, %zmm4, %zmm26
+; AVX512F-NEXT:    vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-NEXT:    vmovdqa64 %zmm30, %zmm1
-; AVX512F-NEXT:    vpermt2q %zmm0, %zmm18, %zmm1
-; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm5
-; AVX512F-NEXT:    vmovdqa64 %zmm19, %zmm1
-; AVX512F-NEXT:    vpermt2q %zmm7, %zmm18, %zmm1
+; AVX512F-NEXT:    vpermt2q %zmm24, %zmm11, %zmm1
 ; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm15 = [0,8,0,8,0,8,0,8]
-; AVX512F-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm21, %zmm1
-; AVX512F-NEXT:    vpermt2q %zmm27, %zmm15, %zmm1
+; AVX512F-NEXT:    vmovdqa64 %zmm30, %zmm1
+; AVX512F-NEXT:    vpermt2q %zmm24, %zmm12, %zmm1
 ; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm14 = [1,9,1,9,1,9,1,9]
-; AVX512F-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm21, %zmm1
-; AVX512F-NEXT:    vpermt2q %zmm27, %zmm14, %zmm1
+; AVX512F-NEXT:    vmovdqa64 %zmm30, %zmm1
+; AVX512F-NEXT:    vpermt2q %zmm24, %zmm10, %zmm1
 ; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm6 = [2,10,2,10,2,10,2,10]
-; AVX512F-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm21, %zmm1
-; AVX512F-NEXT:    vpermt2q %zmm27, %zmm6, %zmm1
+; AVX512F-NEXT:    vpermt2q %zmm24, %zmm4, %zmm30
+; AVX512F-NEXT:    vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm31, %zmm1
+; AVX512F-NEXT:    vpermt2q %zmm27, %zmm11, %zmm1
 ; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [3,11,3,11,3,11,3,11]
-; AVX512F-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vpermt2q %zmm27, %zmm1, %zmm21
-; AVX512F-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm3
-; AVX512F-NEXT:    vpermt2q %zmm28, %zmm15, %zmm3
-; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm3
-; AVX512F-NEXT:    vpermt2q %zmm28, %zmm14, %zmm3
-; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm25
-; AVX512F-NEXT:    vpermt2q %zmm28, %zmm6, %zmm25
-; AVX512F-NEXT:    vpermt2q %zmm28, %zmm1, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 (%rsp), %zmm27 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm27, %zmm2
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm3, %zmm15, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm27, %zmm2
-; AVX512F-NEXT:    vpermt2q %zmm3, %zmm14, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm27, %zmm2
-; AVX512F-NEXT:    vpermt2q %zmm3, %zmm6, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm3, %zmm1, %zmm27
-; AVX512F-NEXT:    vmovdqu64 %zmm27, (%rsp) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm3
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm2, %zmm15, %zmm3
-; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm3
-; AVX512F-NEXT:    vpermt2q %zmm2, %zmm14, %zmm3
+; AVX512F-NEXT:    vmovdqa64 %zmm31, %zmm1
+; AVX512F-NEXT:    vpermt2q %zmm27, %zmm12, %zmm1
+; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm31, %zmm1
+; AVX512F-NEXT:    vpermt2q %zmm27, %zmm10, %zmm1
+; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm27, %zmm4, %zmm31
+; AVX512F-NEXT:    vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm1
+; AVX512F-NEXT:    vpermt2q %zmm28, %zmm11, %zmm1
+; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm1
+; AVX512F-NEXT:    vpermt2q %zmm28, %zmm12, %zmm1
+; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm31
+; AVX512F-NEXT:    vpermt2q %zmm28, %zmm10, %zmm31
+; AVX512F-NEXT:    vpermt2q %zmm28, %zmm4, %zmm3
 ; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm18
-; AVX512F-NEXT:    vpermt2q %zmm2, %zmm6, %zmm18
-; AVX512F-NEXT:    vpermt2q %zmm2, %zmm1, %zmm9
-; AVX512F-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm3
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm9, %zmm15, %zmm3
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm28, %zmm3
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm1, %zmm11, %zmm3
 ; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm3
-; AVX512F-NEXT:    vpermt2q %zmm9, %zmm14, %zmm3
+; AVX512F-NEXT:    vmovdqa64 %zmm28, %zmm3
+; AVX512F-NEXT:    vpermt2q %zmm1, %zmm12, %zmm3
 ; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm3
-; AVX512F-NEXT:    vpermt2q %zmm9, %zmm6, %zmm3
+; AVX512F-NEXT:    vmovdqa64 %zmm28, %zmm3
+; AVX512F-NEXT:    vpermt2q %zmm1, %zmm10, %zmm3
 ; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm9, %zmm1, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm3
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm2, %zmm15, %zmm3
+; AVX512F-NEXT:    vpermt2q %zmm1, %zmm4, %zmm28
+; AVX512F-NEXT:    vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm3
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm1, %zmm11, %zmm3
 ; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm3
-; AVX512F-NEXT:    vpermt2q %zmm2, %zmm14, %zmm3
+; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm3
+; AVX512F-NEXT:    vpermt2q %zmm1, %zmm12, %zmm3
 ; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm13
-; AVX512F-NEXT:    vpermt2q %zmm2, %zmm6, %zmm13
-; AVX512F-NEXT:    vpermt2q %zmm2, %zmm1, %zmm9
+; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm27
+; AVX512F-NEXT:    vpermt2q %zmm1, %zmm10, %zmm27
+; AVX512F-NEXT:    vpermt2q %zmm1, %zmm4, %zmm6
+; AVX512F-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm1
+; AVX512F-NEXT:    vpermt2q %zmm20, %zmm11, %zmm1
+; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm1
+; AVX512F-NEXT:    vpermt2q %zmm20, %zmm12, %zmm1
+; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm1
+; AVX512F-NEXT:    vpermt2q %zmm20, %zmm10, %zmm1
+; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm20, %zmm4, %zmm9
 ; AVX512F-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm3
-; AVX512F-NEXT:    vpermt2q %zmm26, %zmm15, %zmm3
-; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm3
-; AVX512F-NEXT:    vpermt2q %zmm26, %zmm14, %zmm3
-; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm3
-; AVX512F-NEXT:    vpermt2q %zmm26, %zmm6, %zmm3
-; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm26, %zmm1, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm23, %zmm3
-; AVX512F-NEXT:    vpermt2q %zmm29, %zmm15, %zmm3
-; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm23, %zmm3
-; AVX512F-NEXT:    vpermt2q %zmm29, %zmm14, %zmm3
-; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm23, %zmm10
-; AVX512F-NEXT:    vpermt2q %zmm29, %zmm6, %zmm10
-; AVX512F-NEXT:    vpermt2q %zmm29, %zmm1, %zmm23
-; AVX512F-NEXT:    vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm23, %zmm2
-; AVX512F-NEXT:    vpermt2q %zmm22, %zmm15, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm23, %zmm2
-; AVX512F-NEXT:    vpermt2q %zmm22, %zmm14, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm23, %zmm2
-; AVX512F-NEXT:    vpermt2q %zmm22, %zmm6, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm22, %zmm1, %zmm23
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm3
-; AVX512F-NEXT:    vpermt2q %zmm24, %zmm15, %zmm3
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm3
+; AVX512F-NEXT:    vpermt2q %zmm0, %zmm11, %zmm3
 ; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm3
-; AVX512F-NEXT:    vpermt2q %zmm24, %zmm14, %zmm3
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm3
+; AVX512F-NEXT:    vpermt2q %zmm0, %zmm12, %zmm3
 ; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm26
-; AVX512F-NEXT:    vpermt2q %zmm24, %zmm6, %zmm26
-; AVX512F-NEXT:    vpermt2q %zmm24, %zmm1, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm31, %zmm2
-; AVX512F-NEXT:    vpermt2q %zmm12, %zmm15, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm31, %zmm2
-; AVX512F-NEXT:    vpermt2q %zmm12, %zmm14, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm31, %zmm2
-; AVX512F-NEXT:    vpermt2q %zmm12, %zmm6, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm12, %zmm1, %zmm31
-; AVX512F-NEXT:    vmovdqa64 %zmm17, %zmm29
-; AVX512F-NEXT:    vpermt2q %zmm4, %zmm15, %zmm29
-; AVX512F-NEXT:    vmovdqa64 %zmm17, %zmm2
-; AVX512F-NEXT:    vpermt2q %zmm4, %zmm14, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm17, %zmm24
-; AVX512F-NEXT:    vpermt2q %zmm4, %zmm6, %zmm24
-; AVX512F-NEXT:    vpermt2q %zmm4, %zmm1, %zmm17
-; AVX512F-NEXT:    vmovdqa64 %zmm11, %zmm2
-; AVX512F-NEXT:    vpermt2q %zmm20, %zmm15, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm11, %zmm2
-; AVX512F-NEXT:    vpermt2q %zmm20, %zmm14, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm11[0],zmm20[0],zmm11[2],zmm20[2],zmm11[4],zmm20[4],zmm11[6],zmm20[6]
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm11[1],zmm20[1],zmm11[3],zmm20[3],zmm11[5],zmm20[5],zmm11[7],zmm20[7]
-; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm27
-; AVX512F-NEXT:    vmovdqa64 %zmm11, %zmm22
-; AVX512F-NEXT:    vpermt2q %zmm20, %zmm6, %zmm22
-; AVX512F-NEXT:    vpermt2q %zmm20, %zmm1, %zmm11
-; AVX512F-NEXT:    vmovdqa64 %zmm30, %zmm21
-; AVX512F-NEXT:    vpermt2q %zmm0, %zmm15, %zmm21
-; AVX512F-NEXT:    vmovdqa64 %zmm30, %zmm28
-; AVX512F-NEXT:    vpermt2q %zmm0, %zmm14, %zmm28
-; AVX512F-NEXT:    vmovdqa64 %zmm30, %zmm20
-; AVX512F-NEXT:    vpermt2q %zmm0, %zmm6, %zmm20
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm2 = zmm30[0],zmm0[0],zmm30[2],zmm0[2],zmm30[4],zmm0[4],zmm30[6],zmm0[6]
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm3 = zmm30[1],zmm0[1],zmm30[3],zmm0[3],zmm30[5],zmm0[5],zmm30[7],zmm0[7]
-; AVX512F-NEXT:    vpermt2q %zmm0, %zmm1, %zmm30
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm20
+; AVX512F-NEXT:    vpermt2q %zmm0, %zmm10, %zmm20
+; AVX512F-NEXT:    vpermt2q %zmm0, %zmm4, %zmm1
+; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm0
+; AVX512F-NEXT:    vpermt2q %zmm2, %zmm11, %zmm0
+; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm0
+; AVX512F-NEXT:    vpermt2q %zmm2, %zmm12, %zmm0
+; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm0
+; AVX512F-NEXT:    vpermt2q %zmm2, %zmm10, %zmm0
+; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm2, %zmm4, %zmm13
+; AVX512F-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm18, %zmm0
+; AVX512F-NEXT:    vpermt2q %zmm15, %zmm11, %zmm0
+; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm18, %zmm0
+; AVX512F-NEXT:    vpermt2q %zmm15, %zmm12, %zmm0
+; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm18, %zmm28
+; AVX512F-NEXT:    vpermt2q %zmm15, %zmm10, %zmm28
+; AVX512F-NEXT:    vpermt2q %zmm15, %zmm4, %zmm18
+; AVX512F-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-NEXT:    vmovdqa64 %zmm16, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm8, %zmm15, %zmm0
+; AVX512F-NEXT:    vpermt2q %zmm22, %zmm11, %zmm0
 ; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-NEXT:    vmovdqa64 %zmm16, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm8, %zmm14, %zmm0
+; AVX512F-NEXT:    vpermt2q %zmm22, %zmm12, %zmm0
 ; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm16[0],zmm8[0],zmm16[2],zmm8[2],zmm16[4],zmm8[4],zmm16[6],zmm8[6]
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm16[1],zmm8[1],zmm16[3],zmm8[3],zmm16[5],zmm8[5],zmm16[7],zmm8[7]
-; AVX512F-NEXT:    vmovdqa64 %zmm16, %zmm12
-; AVX512F-NEXT:    vpermt2q %zmm8, %zmm6, %zmm12
-; AVX512F-NEXT:    vpermt2q %zmm8, %zmm1, %zmm16
-; AVX512F-NEXT:    vpermi2q %zmm7, %zmm19, %zmm15
-; AVX512F-NEXT:    vpermi2q %zmm7, %zmm19, %zmm14
-; AVX512F-NEXT:    vpermi2q %zmm7, %zmm19, %zmm6
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm8 = zmm19[0],zmm7[0],zmm19[2],zmm7[2],zmm19[4],zmm7[4],zmm19[6],zmm7[6]
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm0 = zmm19[1],zmm7[1],zmm19[3],zmm7[3],zmm19[5],zmm7[5],zmm19[7],zmm7[7]
-; AVX512F-NEXT:    vpermt2q %zmm7, %zmm1, %zmm19
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX512F-NEXT:    # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm7, %zmm2 {%k1}
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm2, %zmm1
-; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX512F-NEXT:    # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm3, %zmm1
-; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX512F-NEXT:    # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm9, %zmm1
-; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX512F-NEXT:    # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm27, %zmm1
+; AVX512F-NEXT:    vmovdqa64 %zmm16, %zmm0
+; AVX512F-NEXT:    vpermt2q %zmm22, %zmm10, %zmm0
+; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm22, %zmm4, %zmm16
+; AVX512F-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm29, %zmm0
+; AVX512F-NEXT:    vpermt2q %zmm5, %zmm11, %zmm0
+; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm29, %zmm0
+; AVX512F-NEXT:    vpermt2q %zmm5, %zmm12, %zmm0
+; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm29, %zmm22
+; AVX512F-NEXT:    vpermt2q %zmm5, %zmm10, %zmm22
+; AVX512F-NEXT:    vpermt2q %zmm5, %zmm4, %zmm29
+; AVX512F-NEXT:    vmovdqa64 384(%r10), %zmm13
+; AVX512F-NEXT:    vmovdqa64 384(%rax), %zmm0
+; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm1
+; AVX512F-NEXT:    vpermt2q %zmm0, %zmm11, %zmm1
 ; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX512F-NEXT:    # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm8 {%k1}
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm8, %zmm1
+; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm1
+; AVX512F-NEXT:    vpermt2q %zmm0, %zmm12, %zmm1
 ; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX512F-NEXT:    # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm0 {%k1}
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm0
+; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm14
+; AVX512F-NEXT:    vpermt2q %zmm0, %zmm19, %zmm14
+; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm15
+; AVX512F-NEXT:    vpermt2q %zmm0, %zmm21, %zmm15
+; AVX512F-NEXT:    vmovdqa64 384(%r8), %zmm5
+; AVX512F-NEXT:    vmovdqa64 384(%r9), %zmm2
+; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm8
+; AVX512F-NEXT:    vpermt2q %zmm2, %zmm23, %zmm8
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm13[0],zmm0[0],zmm13[2],zmm0[2],zmm13[4],zmm0[4],zmm13[6],zmm0[6]
+; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm6
+; AVX512F-NEXT:    vpermt2q %zmm2, %zmm25, %zmm6
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm6 {%k1} = zmm13[1],zmm0[1],zmm13[3],zmm0[3],zmm13[5],zmm0[5],zmm13[7],zmm0[7]
+; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm30
+; AVX512F-NEXT:    vpermt2q %zmm0, %zmm10, %zmm30
+; AVX512F-NEXT:    vpermt2q %zmm0, %zmm4, %zmm13
+; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm24
+; AVX512F-NEXT:    vpermt2q %zmm2, %zmm11, %zmm24
+; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm26
+; AVX512F-NEXT:    vpermt2q %zmm2, %zmm12, %zmm26
+; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm17
+; AVX512F-NEXT:    vpermt2q %zmm2, %zmm10, %zmm17
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm7 = zmm5[0],zmm2[0],zmm5[2],zmm2[2],zmm5[4],zmm2[4],zmm5[6],zmm2[6]
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm9 = zmm5[1],zmm2[1],zmm5[3],zmm2[3],zmm5[5],zmm2[5],zmm5[7],zmm2[7]
+; AVX512F-NEXT:    vpermt2q %zmm2, %zmm4, %zmm5
+; AVX512F-NEXT:    vmovdqa64 448(%r10), %zmm16
+; AVX512F-NEXT:    vmovdqa64 448(%rax), %zmm1
+; AVX512F-NEXT:    vmovdqa64 %zmm16, %zmm0
+; AVX512F-NEXT:    vpermt2q %zmm1, %zmm11, %zmm0
+; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm16, %zmm0
+; AVX512F-NEXT:    vpermt2q %zmm1, %zmm12, %zmm0
 ; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermi2q %zmm1, %zmm16, %zmm19
+; AVX512F-NEXT:    vpermi2q %zmm1, %zmm16, %zmm21
+; AVX512F-NEXT:    vmovdqa64 448(%r8), %zmm3
+; AVX512F-NEXT:    vmovdqa64 448(%r9), %zmm0
+; AVX512F-NEXT:    vpermi2q %zmm0, %zmm3, %zmm23
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm23 {%k1} = zmm16[0],zmm1[0],zmm16[2],zmm1[2],zmm16[4],zmm1[4],zmm16[6],zmm1[6]
+; AVX512F-NEXT:    vpermi2q %zmm0, %zmm3, %zmm25
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm25 {%k1} = zmm16[1],zmm1[1],zmm16[3],zmm1[3],zmm16[5],zmm1[5],zmm16[7],zmm1[7]
+; AVX512F-NEXT:    vmovdqa64 %zmm16, %zmm18
+; AVX512F-NEXT:    vpermt2q %zmm1, %zmm10, %zmm18
+; AVX512F-NEXT:    vpermt2q %zmm1, %zmm4, %zmm16
+; AVX512F-NEXT:    vpermi2q %zmm0, %zmm3, %zmm11
+; AVX512F-NEXT:    vpermi2q %zmm0, %zmm3, %zmm12
+; AVX512F-NEXT:    vpermi2q %zmm0, %zmm3, %zmm10
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm1 = zmm3[0],zmm0[0],zmm3[2],zmm0[2],zmm3[4],zmm0[4],zmm3[6],zmm0[6]
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm2 = zmm3[1],zmm0[1],zmm3[3],zmm0[3],zmm3[5],zmm0[5],zmm3[7],zmm0[7]
+; AVX512F-NEXT:    vpermt2q %zmm0, %zmm4, %zmm3
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512F-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
 ; AVX512F-NEXT:    # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm0
+; AVX512F-NEXT:    vmovdqa64 %zmm14, %zmm7 {%k1}
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm7, %zmm0
 ; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512F-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
 ; AVX512F-NEXT:    # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm5, %zmm0
+; AVX512F-NEXT:    vmovdqa64 %zmm15, %zmm9 {%k1}
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm9, %zmm0
+; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX512F-NEXT:    # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm8, %zmm0
+; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX512F-NEXT:    # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm6, %zmm0
+; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vpblendd $240, (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX512F-NEXT:    # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512F-NEXT:    vmovdqa64 %zmm19, %zmm1 {%k1}
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX512F-NEXT:    # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512F-NEXT:    vmovdqa64 %zmm21, %zmm2 {%k1}
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm0
+; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX512F-NEXT:    # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm23, %zmm0
+; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX512F-NEXT:    # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm25, %zmm0
 ; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm25 {%k1}
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm8 {%k1}
 ; AVX512F-NEXT:    vmovdqa (%rcx), %ymm0
 ; AVX512F-NEXT:    vmovdqa (%rdx), %ymm1
 ; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
-; AVX512F-NEXT:    vmovdqa (%rsi), %ymm3
-; AVX512F-NEXT:    vmovdqa (%rdi), %ymm7
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2]
-; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm25, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa (%rsi), %ymm4
+; AVX512F-NEXT:    vmovdqa (%rdi), %ymm6
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm4[0],ymm6[2],ymm4[2]
+; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm8, %zmm2
+; AVX512F-NEXT:    vmovdqu64 %zmm2, (%rsp) # 64-byte Spill
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm4 {%k1}
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm7 {%k1}
 ; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3]
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm4[1],ymm6[3],ymm4[3]
 ; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm0
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm7, %zmm0
 ; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm18 {%k1}
+; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm31 {%k1}
 ; AVX512F-NEXT:    vmovdqa 64(%rcx), %ymm0
 ; AVX512F-NEXT:    vmovdqa 64(%rdx), %ymm1
 ; AVX512F-NEXT:    vmovdqa 64(%rsi), %ymm2
-; AVX512F-NEXT:    vmovdqa 64(%rdi), %ymm3
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm7 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm8 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
-; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm7 = ymm8[2,3],ymm7[2,3]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm7, %zmm18, %zmm4
-; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 (%rsp), %zmm4 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm5 {%k1}
+; AVX512F-NEXT:    vmovdqa 64(%rdi), %ymm4
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm6 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm7 = ymm4[0],ymm2[0],ymm4[2],ymm2[2]
+; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm6 = ymm7[2,3],ymm6[2,3]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm6, %zmm31, %zmm6
+; AVX512F-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm7 {%k1}
 ; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm2[1],ymm3[3],ymm2[3]
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm4[1],ymm2[1],ymm4[3],ymm2[3]
 ; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm5, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm7, %zmm8
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm13 {%k1}
+; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm27 {%k1}
 ; AVX512F-NEXT:    vmovdqa 128(%rcx), %ymm0
 ; AVX512F-NEXT:    vmovdqa 128(%rdx), %ymm1
 ; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
-; AVX512F-NEXT:    vmovdqa 128(%rsi), %ymm3
-; AVX512F-NEXT:    vmovdqa 128(%rdi), %ymm7
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2]
-; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm13, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, (%rsp) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa 128(%rsi), %ymm4
+; AVX512F-NEXT:    vmovdqa 128(%rdi), %ymm6
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm4[0],ymm6[2],ymm4[2]
+; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm27, %zmm23
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm4 {%k1}
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm7 {%k1}
 ; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3]
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm4[1],ymm6[3],ymm4[3]
 ; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm7, %zmm25
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm10 {%k1}
+; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm20 {%k1}
 ; AVX512F-NEXT:    vmovdqa 192(%rcx), %ymm0
 ; AVX512F-NEXT:    vmovdqa 192(%rdx), %ymm1
 ; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
-; AVX512F-NEXT:    vmovdqa 192(%rsi), %ymm3
-; AVX512F-NEXT:    vmovdqa 192(%rdi), %ymm7
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2]
-; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm10, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa 192(%rsi), %ymm4
+; AVX512F-NEXT:    vmovdqa 192(%rdi), %ymm6
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm4[0],ymm6[2],ymm4[2]
+; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm20, %zmm21
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm4 {%k1}
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm7 {%k1}
 ; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3]
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm4[1],ymm6[3],ymm4[3]
 ; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm9
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm7, %zmm19
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm26 {%k1}
+; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm28 {%k1}
 ; AVX512F-NEXT:    vmovdqa 256(%rcx), %ymm0
 ; AVX512F-NEXT:    vmovdqa 256(%rdx), %ymm1
 ; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
-; AVX512F-NEXT:    vmovdqa 256(%rsi), %ymm3
-; AVX512F-NEXT:    vmovdqa 256(%rdi), %ymm7
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2]
-; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm26, %zmm26
+; AVX512F-NEXT:    vmovdqa 256(%rsi), %ymm4
+; AVX512F-NEXT:    vmovdqa 256(%rdi), %ymm6
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm4[0],ymm6[2],ymm4[2]
+; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm28, %zmm15
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm23, %zmm2 {%k1}
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm7 {%k1}
 ; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3]
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm4[1],ymm6[3],ymm4[3]
 ; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm7
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm7, %zmm28
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm24 {%k1}
+; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm22 {%k1}
 ; AVX512F-NEXT:    vmovdqa 320(%rcx), %ymm0
 ; AVX512F-NEXT:    vmovdqa 320(%rdx), %ymm1
 ; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
-; AVX512F-NEXT:    vmovdqa 320(%rsi), %ymm3
-; AVX512F-NEXT:    vmovdqa64 320(%rdi), %ymm23
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm8 = ymm23[0],ymm3[0],ymm23[2],ymm3[2]
-; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm24, %zmm8
-; AVX512F-NEXT:    vmovdqa64 %zmm31, %zmm17 {%k1}
+; AVX512F-NEXT:    vmovdqa 320(%rsi), %ymm4
+; AVX512F-NEXT:    vmovdqa 320(%rdi), %ymm6
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm4[0],ymm6[2],ymm4[2]
+; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm22, %zmm22
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm29 {%k1}
 ; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm23[1],ymm3[1],ymm23[3],ymm3[3]
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm4[1],ymm6[3],ymm4[3]
 ; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm17, %zmm17
-; AVX512F-NEXT:    vmovdqa64 %zmm22, %zmm20 {%k1}
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm29, %zmm27
+; AVX512F-NEXT:    vmovdqa64 %zmm30, %zmm17 {%k1}
 ; AVX512F-NEXT:    vmovdqa 384(%rcx), %ymm0
 ; AVX512F-NEXT:    vmovdqa 384(%rdx), %ymm1
 ; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
-; AVX512F-NEXT:    vmovdqa 384(%rsi), %ymm3
-; AVX512F-NEXT:    vmovdqa64 384(%rdi), %ymm18
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm13 = ymm18[0],ymm3[0],ymm18[2],ymm3[2]
-; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm13[2,3],ymm2[2,3]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm20, %zmm20
-; AVX512F-NEXT:    vmovdqa64 %zmm11, %zmm30 {%k1}
+; AVX512F-NEXT:    vmovdqa 384(%rsi), %ymm4
+; AVX512F-NEXT:    vmovdqa 384(%rdi), %ymm6
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm4[0],ymm6[2],ymm4[2]
+; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm17, %zmm14
+; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm5 {%k1}
 ; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm18[1],ymm3[1],ymm18[3],ymm3[3]
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm4[1],ymm6[3],ymm4[3]
 ; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm30, %zmm24
-; AVX512F-NEXT:    vmovdqa64 %zmm12, %zmm6 {%k1}
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm5, %zmm13
+; AVX512F-NEXT:    vmovdqa64 %zmm18, %zmm10 {%k1}
 ; AVX512F-NEXT:    vmovdqa 448(%rcx), %ymm0
 ; AVX512F-NEXT:    vmovdqa 448(%rdx), %ymm1
 ; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
-; AVX512F-NEXT:    vmovdqa 448(%rsi), %ymm3
-; AVX512F-NEXT:    vmovdqa 448(%rdi), %ymm10
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm12 = ymm10[0],ymm3[0],ymm10[2],ymm3[2]
-; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm12[2,3],ymm2[2,3]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm6, %zmm31
+; AVX512F-NEXT:    vmovdqa 448(%rsi), %ymm4
+; AVX512F-NEXT:    vmovdqa 448(%rdi), %ymm5
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm6 = ymm5[0],ymm4[0],ymm5[2],ymm4[2]
+; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm6[2,3],ymm2[2,3]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm10, %zmm10
 ; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm10[1],ymm3[1],ymm10[3],ymm3[3]
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm5[1],ymm4[1],ymm5[3],ymm4[3]
 ; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm16, %zmm19 {%k1}
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm19, %zmm0
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
-; AVX512F-NEXT:    vmovdqa (%rsi), %xmm1
-; AVX512F-NEXT:    vinserti128 $1, (%rcx), %ymm1, %ymm1
-; AVX512F-NEXT:    vmovdqa (%rdi), %xmm2
-; AVX512F-NEXT:    vinserti128 $1, (%rdx), %ymm2, %ymm2
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[2],ymm1[2]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm3, %zmm4, %zmm3
+; AVX512F-NEXT:    vmovdqa64 %zmm16, %zmm3 {%k1}
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm3, %zmm9
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm3 {%k1}
+; AVX512F-NEXT:    vmovdqa (%rsi), %xmm0
+; AVX512F-NEXT:    vinserti128 $1, (%rcx), %ymm0, %ymm1
+; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512F-NEXT:    vinserti128 $1, (%rdx), %ymm0, %ymm2
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm2[0],ymm1[0],ymm2[2],ymm1[2]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm3, %zmm0
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm10 {%k1}
+; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
 ; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm10, %zmm2
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm2
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm11 {%k1}
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm5 {%k1}
 ; AVX512F-NEXT:    vmovdqa 64(%rsi), %xmm1
 ; AVX512F-NEXT:    vinserti128 $1, 64(%rcx), %ymm1, %ymm1
-; AVX512F-NEXT:    vmovdqa 64(%rdi), %xmm4
-; AVX512F-NEXT:    vinserti128 $1, 64(%rdx), %ymm4, %ymm4
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm10 = ymm4[0],ymm1[0],ymm4[2],ymm1[2]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm10, %zmm11, %zmm10
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm11, %zmm12 {%k1}
+; AVX512F-NEXT:    vmovdqa 64(%rdi), %xmm3
+; AVX512F-NEXT:    vinserti128 $1, 64(%rdx), %ymm3, %ymm4
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm4[0],ymm1[0],ymm4[2],ymm1[2]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm3, %zmm5, %zmm3
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm6 {%k1}
 ; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm4[1],ymm1[1],ymm4[3],ymm1[3]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm12, %zmm11
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm6, %zmm5
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm13 {%k1}
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm7 {%k1}
 ; AVX512F-NEXT:    vmovdqa 128(%rsi), %xmm1
 ; AVX512F-NEXT:    vinserti128 $1, 128(%rcx), %ymm1, %ymm1
 ; AVX512F-NEXT:    vmovdqa 128(%rdi), %xmm4
-; AVX512F-NEXT:    vinserti128 $1, 128(%rdx), %ymm4, %ymm12
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm4 = ymm12[0],ymm1[0],ymm12[2],ymm1[2]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm4, %zmm13, %zmm4
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm18 {%k1}
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm12[1],ymm1[1],ymm12[3],ymm1[3]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm18, %zmm19
+; AVX512F-NEXT:    vinserti128 $1, 128(%rdx), %ymm4, %ymm6
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm4 = ymm6[0],ymm1[0],ymm6[2],ymm1[2]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm4, %zmm7, %zmm4
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm7, %zmm16 {%k1}
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm1[1],ymm6[3],ymm1[3]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm16, %zmm6
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm18 {%k1}
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm17 {%k1}
 ; AVX512F-NEXT:    vmovdqa 192(%rsi), %xmm1
 ; AVX512F-NEXT:    vinserti128 $1, 192(%rcx), %ymm1, %ymm1
-; AVX512F-NEXT:    vmovdqa 192(%rdi), %xmm12
-; AVX512F-NEXT:    vinserti128 $1, 192(%rdx), %ymm12, %ymm12
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm1[0],ymm12[2],ymm1[2]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm13, %zmm18, %zmm30
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa 192(%rdi), %xmm7
+; AVX512F-NEXT:    vinserti32x4 $1, 192(%rdx), %ymm7, %ymm16
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm7 = ymm16[0],ymm1[0],ymm16[2],ymm1[2]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm7, %zmm17, %zmm7
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm18 {%k1}
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm12[1],ymm1[1],ymm12[3],ymm1[3]
+; AVX512F-NEXT:    vmovdqa64 %zmm17, %zmm18 {%k1}
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm16[1],ymm1[1],ymm16[3],ymm1[3]
 ; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm18, %zmm1
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm12, %zmm5 {%k1}
-; AVX512F-NEXT:    vmovdqa 256(%rsi), %xmm12
-; AVX512F-NEXT:    vinserti128 $1, 256(%rcx), %ymm12, %ymm13
-; AVX512F-NEXT:    vmovdqa 256(%rdi), %xmm12
-; AVX512F-NEXT:    vinserti32x4 $1, 256(%rdx), %ymm12, %ymm18
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm12 = ymm18[0],ymm13[0],ymm18[2],ymm13[2]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm12, %zmm5, %zmm12
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm23, %zmm5 {%k1}
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm13 = ymm18[1],ymm13[1],ymm18[3],ymm13[3]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm13, %zmm5, %zmm23
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm29 {%k1}
-; AVX512F-NEXT:    vmovdqa 320(%rsi), %xmm13
-; AVX512F-NEXT:    vinserti128 $1, 320(%rcx), %ymm13, %ymm13
-; AVX512F-NEXT:    vmovdqa64 320(%rdi), %xmm18
-; AVX512F-NEXT:    vinserti32x4 $1, 320(%rdx), %ymm18, %ymm18
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm27 = ymm18[0],ymm13[0],ymm18[2],ymm13[2]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm27, %zmm29, %zmm22
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm6 {%k1}
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm13 = ymm18[1],ymm13[1],ymm18[3],ymm13[3]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm13, %zmm6, %zmm13
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm21 {%k1}
-; AVX512F-NEXT:    vmovdqa64 384(%rsi), %xmm18
-; AVX512F-NEXT:    vinserti32x4 $1, 384(%rcx), %ymm18, %ymm18
-; AVX512F-NEXT:    vmovdqa64 384(%rdi), %xmm25
-; AVX512F-NEXT:    vinserti32x4 $1, 384(%rdx), %ymm25, %ymm25
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm27 = ymm25[0],ymm18[0],ymm25[2],ymm18[2]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm27, %zmm21, %zmm16
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm28 {%k1}
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm18 = ymm25[1],ymm18[1],ymm25[3],ymm18[3]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm18, %zmm28, %zmm21
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm15 {%k1}
-; AVX512F-NEXT:    vmovdqa64 448(%rsi), %xmm18
-; AVX512F-NEXT:    vinserti32x4 $1, 448(%rcx), %ymm18, %ymm18
-; AVX512F-NEXT:    vmovdqa64 448(%rdi), %xmm25
-; AVX512F-NEXT:    vinserti32x4 $1, 448(%rdx), %ymm25, %ymm25
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm27 = ymm25[0],ymm18[0],ymm25[2],ymm18[2]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm27, %zmm15, %zmm6
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm14 {%k1}
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm18 = ymm25[1],ymm18[1],ymm25[3],ymm18[3]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm18, %zmm14, %zmm5
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm16, %zmm20 {%k1}
+; AVX512F-NEXT:    vmovdqa64 256(%rsi), %xmm16
+; AVX512F-NEXT:    vinserti32x4 $1, 256(%rcx), %ymm16, %ymm16
+; AVX512F-NEXT:    vmovdqa64 256(%rdi), %xmm17
+; AVX512F-NEXT:    vinserti32x4 $1, 256(%rdx), %ymm17, %ymm17
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm18 = ymm17[0],ymm16[0],ymm17[2],ymm16[2]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm18, %zmm20, %zmm29
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm18, %zmm20 {%k1}
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm16 = ymm17[1],ymm16[1],ymm17[3],ymm16[3]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm16, %zmm20, %zmm30
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm16, %zmm20 {%k1}
+; AVX512F-NEXT:    vmovdqa64 320(%rsi), %xmm16
+; AVX512F-NEXT:    vinserti32x4 $1, 320(%rcx), %ymm16, %ymm16
+; AVX512F-NEXT:    vmovdqa64 320(%rdi), %xmm17
+; AVX512F-NEXT:    vinserti32x4 $1, 320(%rdx), %ymm17, %ymm17
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm18 = ymm17[0],ymm16[0],ymm17[2],ymm16[2]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm18, %zmm20, %zmm18
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm20, %zmm31 {%k1}
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm16 = ymm17[1],ymm16[1],ymm17[3],ymm16[3]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm16, %zmm31, %zmm16
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm17, %zmm24 {%k1}
+; AVX512F-NEXT:    vmovdqa64 384(%rsi), %xmm17
+; AVX512F-NEXT:    vinserti32x4 $1, 384(%rcx), %ymm17, %ymm17
+; AVX512F-NEXT:    vmovdqa64 384(%rdi), %xmm20
+; AVX512F-NEXT:    vinserti32x4 $1, 384(%rdx), %ymm20, %ymm20
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm31 = ymm20[0],ymm17[0],ymm20[2],ymm17[2]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm31, %zmm24, %zmm24
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm31, %zmm26 {%k1}
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm17 = ymm20[1],ymm17[1],ymm20[3],ymm17[3]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm17, %zmm26, %zmm17
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm20, %zmm11 {%k1}
+; AVX512F-NEXT:    vmovdqa64 448(%rsi), %xmm20
+; AVX512F-NEXT:    vinserti32x4 $1, 448(%rcx), %ymm20, %ymm20
+; AVX512F-NEXT:    vmovdqa64 448(%rdi), %xmm26
+; AVX512F-NEXT:    vinserti32x4 $1, 448(%rdx), %ymm26, %ymm26
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm31 = ymm26[0],ymm20[0],ymm26[2],ymm20[2]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm31, %zmm11, %zmm11
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm31, %zmm12 {%k1}
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm20 = ymm26[1],ymm20[1],ymm26[3],ymm20[3]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm20, %zmm12, %zmm12
 ; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512F-NEXT:    vmovdqa64 %zmm0, 3776(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm31, 3712(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm24, 3264(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm20, 3200(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm17, 2752(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm8, 2688(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm7, 2240(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm26, 2176(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm9, 1728(%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 1664(%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 1216(%rax)
-; AVX512F-NEXT:    vmovups (%rsp), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 1152(%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 704(%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 640(%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 192(%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 128(%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 4032(%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 3968(%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 3904(%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 3840(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm5, 3648(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm6, 3584(%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 3520(%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 3456(%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 3392(%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 3328(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm21, 3136(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm16, 3072(%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 3008(%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 2944(%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 2880(%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 2816(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm13, 2624(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm22, 2560(%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 2496(%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 2432(%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 2368(%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 2304(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm23, 2112(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm12, 2048(%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 1984(%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 1920(%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 1856(%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 1792(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm9, 3776(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm10, 3712(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm13, 3264(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm14, 3200(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm27, 2752(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm22, 2688(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm28, 2240(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm15, 2176(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm19, 1728(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm21, 1664(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm25, 1216(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm23, 1152(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm8, 704(%rax)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm8, 640(%rax)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm8, 192(%rax)
+; AVX512F-NEXT:    vmovups (%rsp), %zmm8 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm8, 128(%rax)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm8, 4032(%rax)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm8, 3968(%rax)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm8, 3904(%rax)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm8, 3840(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm12, 3648(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm11, 3584(%rax)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm8, 3520(%rax)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm8, 3456(%rax)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm8, 3392(%rax)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm8, 3328(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm17, 3136(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm24, 3072(%rax)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm8, 3008(%rax)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm8, 2944(%rax)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm8, 2880(%rax)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm8, 2816(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm16, 2624(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm18, 2560(%rax)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm8, 2496(%rax)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm8, 2432(%rax)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm8, 2368(%rax)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm8, 2304(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm30, 2112(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm29, 2048(%rax)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm8, 1984(%rax)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm8, 1920(%rax)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm8, 1856(%rax)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm8, 1792(%rax)
 ; AVX512F-NEXT:    vmovdqa64 %zmm1, 1600(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm30, 1536(%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 1472(%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 1408(%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 1344(%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 1280(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm19, 1088(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm7, 1536(%rax)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm1, 1472(%rax)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm1, 1408(%rax)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm1, 1344(%rax)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm1, 1280(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm6, 1088(%rax)
 ; AVX512F-NEXT:    vmovdqa64 %zmm4, 1024(%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 960(%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 896(%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 832(%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 768(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm11, 576(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm10, 512(%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 448(%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 384(%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 320(%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm0, 256(%rax)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm1, 960(%rax)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm1, 896(%rax)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm1, 832(%rax)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm1, 768(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm5, 576(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm3, 512(%rax)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm1, 448(%rax)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm1, 384(%rax)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm1, 320(%rax)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm1, 256(%rax)
 ; AVX512F-NEXT:    vmovdqa64 %zmm2, 64(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm3, (%rax)
-; AVX512F-NEXT:    addq $5512, %rsp # imm = 0x1588
+; AVX512F-NEXT:    vmovdqa64 %zmm0, (%rax)
+; AVX512F-NEXT:    addq $5384, %rsp # imm = 0x1508
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512BW-LABEL: store_i64_stride8_vf64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    subq $5512, %rsp # imm = 0x1588
+; AVX512BW-NEXT:    subq $5384, %rsp # imm = 0x1508
 ; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX512BW-NEXT:    vmovdqa64 128(%rdi), %zmm2
+; AVX512BW-NEXT:    vmovdqa64 128(%rdi), %zmm0
 ; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %zmm4
-; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm8
-; AVX512BW-NEXT:    vmovdqa64 128(%rsi), %zmm17
-; AVX512BW-NEXT:    vmovdqa64 64(%rsi), %zmm19
-; AVX512BW-NEXT:    vmovdqa64 (%rsi), %zmm22
-; AVX512BW-NEXT:    vmovdqa64 64(%rdx), %zmm5
-; AVX512BW-NEXT:    vmovdqa64 (%rdx), %zmm10
-; AVX512BW-NEXT:    vmovdqa64 64(%rcx), %zmm20
-; AVX512BW-NEXT:    vmovdqa64 (%rcx), %zmm11
-; AVX512BW-NEXT:    vmovdqa64 (%r8), %zmm1
-; AVX512BW-NEXT:    vmovdqa64 64(%r8), %zmm25
-; AVX512BW-NEXT:    vmovdqa64 128(%r8), %zmm23
-; AVX512BW-NEXT:    vmovdqa64 (%r9), %zmm28
-; AVX512BW-NEXT:    vmovdqa64 64(%r9), %zmm26
-; AVX512BW-NEXT:    vmovdqa64 128(%r9), %zmm24
-; AVX512BW-NEXT:    vmovdqa64 (%r10), %zmm21
-; AVX512BW-NEXT:    vmovdqa64 64(%r10), %zmm14
-; AVX512BW-NEXT:    vmovdqa64 (%rax), %zmm27
-; AVX512BW-NEXT:    vmovdqa64 64(%rax), %zmm16
+; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm14
+; AVX512BW-NEXT:    vmovdqa64 128(%rsi), %zmm1
+; AVX512BW-NEXT:    vmovdqa64 64(%rsi), %zmm5
+; AVX512BW-NEXT:    vmovdqa64 (%rsi), %zmm10
+; AVX512BW-NEXT:    vmovdqa64 128(%rdx), %zmm2
+; AVX512BW-NEXT:    vmovdqa64 64(%rdx), %zmm7
+; AVX512BW-NEXT:    vmovdqa64 (%rdx), %zmm12
+; AVX512BW-NEXT:    vmovdqa64 128(%rcx), %zmm3
+; AVX512BW-NEXT:    vmovdqa64 64(%rcx), %zmm9
+; AVX512BW-NEXT:    vmovdqa64 (%rcx), %zmm15
+; AVX512BW-NEXT:    vmovdqa64 (%r8), %zmm30
+; AVX512BW-NEXT:    vmovdqa64 64(%r8), %zmm18
+; AVX512BW-NEXT:    vmovdqa64 128(%r8), %zmm11
+; AVX512BW-NEXT:    vmovdqa64 (%r9), %zmm24
+; AVX512BW-NEXT:    vmovdqa64 64(%r9), %zmm28
+; AVX512BW-NEXT:    vmovdqa64 128(%r9), %zmm22
+; AVX512BW-NEXT:    vmovdqa64 (%r10), %zmm26
+; AVX512BW-NEXT:    vmovdqa64 64(%r10), %zmm31
+; AVX512BW-NEXT:    vmovdqa64 128(%r10), %zmm16
+; AVX512BW-NEXT:    vmovdqa64 (%rax), %zmm17
+; AVX512BW-NEXT:    vmovdqa64 64(%rax), %zmm27
+; AVX512BW-NEXT:    vmovdqa64 128(%rax), %zmm13
 ; AVX512BW-NEXT:    movb $-64, %r11b
 ; AVX512BW-NEXT:    kmovd %r11d, %k1
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [4,12,4,12,4,12,4,12]
-; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm21, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm27, %zmm3, %zmm0
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm12 = zmm1[0],zmm28[0],zmm1[2],zmm28[2],zmm1[4],zmm28[4],zmm1[6],zmm28[6]
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm12 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm22, %zmm3, %zmm0
-; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm6 = [4,12,4,12]
-; AVX512BW-NEXT:    # ymm6 = mem[0,1,0,1]
-; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm15
-; AVX512BW-NEXT:    vpermt2q %zmm11, %zmm6, %zmm15
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm12, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm9 = [5,13,5,13,5,13,5,13]
-; AVX512BW-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm21, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm27, %zmm9, %zmm0
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm12 = zmm1[1],zmm28[1],zmm1[3],zmm28[3],zmm1[5],zmm28[5],zmm1[7],zmm28[7]
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm12 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm22, %zmm9, %zmm0
-; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm7 = [5,13,5,13]
-; AVX512BW-NEXT:    # ymm7 = mem[0,1,0,1]
-; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm15
-; AVX512BW-NEXT:    vpermt2q %zmm11, %zmm7, %zmm15
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm12, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,6,14,6,14,6,14]
-; AVX512BW-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm0
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm29
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm28, %zmm13, %zmm0
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm21[0],zmm27[0],zmm21[2],zmm27[2],zmm21[4],zmm27[4],zmm21[6],zmm27[6]
-; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm12
-; AVX512BW-NEXT:    vpermt2q %zmm22, %zmm13, %zmm12
-; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14]
-; AVX512BW-NEXT:    # ymm1 = mem[0,1,0,1]
-; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm15
-; AVX512BW-NEXT:    vpermt2q %zmm11, %zmm1, %zmm15
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm15[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm12, %zmm0, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm18 = [7,15,7,15,7,15,7,15]
-; AVX512BW-NEXT:    # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm29, %zmm12
-; AVX512BW-NEXT:    vpermt2q %zmm28, %zmm18, %zmm12
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm12 {%k1} = zmm21[1],zmm27[1],zmm21[3],zmm27[3],zmm21[5],zmm27[5],zmm21[7],zmm27[7]
-; AVX512BW-NEXT:    vpermt2q %zmm22, %zmm18, %zmm8
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} ymm30 = [7,15,7,15]
-; AVX512BW-NEXT:    # ymm30 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vpermt2q %zmm11, %zmm30, %zmm10
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm8, %zmm12, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm19 = [4,12,4,12,4,12,4,12]
+; AVX512BW-NEXT:    # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vmovdqa64 %zmm12, %zmm8
+; AVX512BW-NEXT:    vpermt2q %zmm15, %zmm19, %zmm8
+; AVX512BW-NEXT:    vmovdqa64 %zmm14, %zmm6
+; AVX512BW-NEXT:    vpermt2q %zmm10, %zmm19, %zmm6
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqa64 %zmm26, %zmm8
+; AVX512BW-NEXT:    vpermt2q %zmm17, %zmm19, %zmm8
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm20 = zmm30[0],zmm24[0],zmm30[2],zmm24[2],zmm30[4],zmm24[4],zmm30[6],zmm24[6]
+; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm20 {%k1}
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm6, %zmm20, %zmm6
+; AVX512BW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm21 = [5,13,5,13,5,13,5,13]
+; AVX512BW-NEXT:    # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vmovdqa64 %zmm12, %zmm6
+; AVX512BW-NEXT:    vpermt2q %zmm15, %zmm21, %zmm6
 ; AVX512BW-NEXT:    vmovdqa64 %zmm14, %zmm8
-; AVX512BW-NEXT:    vpermt2q %zmm16, %zmm3, %zmm8
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm10 = zmm25[0],zmm26[0],zmm25[2],zmm26[2],zmm25[4],zmm26[4],zmm25[6],zmm26[6]
-; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm10 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm8
-; AVX512BW-NEXT:    vpermt2q %zmm19, %zmm3, %zmm8
-; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm11
-; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm15
-; AVX512BW-NEXT:    vpermt2q %zmm20, %zmm6, %zmm11
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm11[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm8, %zmm10, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm10, %zmm21, %zmm8
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqa64 %zmm26, %zmm8
+; AVX512BW-NEXT:    vpermt2q %zmm17, %zmm21, %zmm8
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm20 = zmm30[1],zmm24[1],zmm30[3],zmm24[3],zmm30[5],zmm24[5],zmm30[7],zmm24[7]
+; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm20 {%k1}
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm6, %zmm20, %zmm6
+; AVX512BW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm23 = [6,14,6,14,6,14,6,14]
+; AVX512BW-NEXT:    # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vmovdqa64 %zmm12, %zmm6
+; AVX512BW-NEXT:    vpermt2q %zmm15, %zmm23, %zmm6
 ; AVX512BW-NEXT:    vmovdqa64 %zmm14, %zmm8
-; AVX512BW-NEXT:    vpermt2q %zmm16, %zmm9, %zmm8
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm10 = zmm25[1],zmm26[1],zmm25[3],zmm26[3],zmm25[5],zmm26[5],zmm25[7],zmm26[7]
-; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm10 {%k1}
+; AVX512BW-NEXT:    vpermt2q %zmm10, %zmm23, %zmm8
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqa64 %zmm30, %zmm8
+; AVX512BW-NEXT:    vpermt2q %zmm24, %zmm23, %zmm8
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm26[0],zmm17[0],zmm26[2],zmm17[2],zmm26[4],zmm17[4],zmm26[6],zmm17[6]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm6, %zmm8, %zmm6
+; AVX512BW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm25 = [7,15,7,15,7,15,7,15]
+; AVX512BW-NEXT:    # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vpermt2q %zmm15, %zmm25, %zmm12
+; AVX512BW-NEXT:    vpermt2q %zmm10, %zmm25, %zmm14
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm6 = ymm14[0,1,2,3],ymm12[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqa64 %zmm30, %zmm8
+; AVX512BW-NEXT:    vpermt2q %zmm24, %zmm25, %zmm8
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm8 {%k1} = zmm26[1],zmm17[1],zmm26[3],zmm17[3],zmm26[5],zmm17[5],zmm26[7],zmm17[7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm6, %zmm8, %zmm6
+; AVX512BW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm6
+; AVX512BW-NEXT:    vpermt2q %zmm9, %zmm19, %zmm6
 ; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm8
-; AVX512BW-NEXT:    vpermt2q %zmm19, %zmm9, %zmm8
-; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm11
-; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm12
-; AVX512BW-NEXT:    vpermt2q %zmm20, %zmm7, %zmm11
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm11[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm8, %zmm10, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm5, %zmm19, %zmm8
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqa64 %zmm31, %zmm8
+; AVX512BW-NEXT:    vpermt2q %zmm27, %zmm19, %zmm8
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm10 = zmm18[0],zmm28[0],zmm18[2],zmm28[2],zmm18[4],zmm28[4],zmm18[6],zmm28[6]
+; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm10 {%k1}
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm6, %zmm10, %zmm6
+; AVX512BW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm6
+; AVX512BW-NEXT:    vpermt2q %zmm9, %zmm21, %zmm6
 ; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm8
-; AVX512BW-NEXT:    vpermt2q %zmm19, %zmm13, %zmm8
-; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm10
-; AVX512BW-NEXT:    vpermt2q %zmm20, %zmm1, %zmm10
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm25, %zmm10
-; AVX512BW-NEXT:    vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm26, %zmm13, %zmm10
-; AVX512BW-NEXT:    vmovdqa64 %zmm14, %zmm11
-; AVX512BW-NEXT:    vmovdqu64 %zmm14, (%rsp) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm10 {%k1} = zmm14[0],zmm16[0],zmm14[2],zmm16[2],zmm14[4],zmm16[4],zmm14[6],zmm16[6]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm8, %zmm10, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 128(%r10), %zmm10
-; AVX512BW-NEXT:    vpermt2q %zmm19, %zmm18, %zmm4
-; AVX512BW-NEXT:    vmovdqa64 128(%rax), %zmm14
-; AVX512BW-NEXT:    vpermt2q %zmm20, %zmm30, %zmm5
-; AVX512BW-NEXT:    vmovdqa64 %zmm25, %zmm6
-; AVX512BW-NEXT:    vpermt2q %zmm26, %zmm18, %zmm6
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm6 {%k1} = zmm11[1],zmm16[1],zmm11[3],zmm16[3],zmm11[5],zmm16[5],zmm11[7],zmm16[7]
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm4, %zmm6, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm4
-; AVX512BW-NEXT:    vpermt2q %zmm14, %zmm3, %zmm4
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm5 = zmm23[0],zmm24[0],zmm23[2],zmm24[2],zmm23[4],zmm24[4],zmm23[6],zmm24[6]
-; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm5 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm6
-; AVX512BW-NEXT:    vpermt2q %zmm17, %zmm3, %zmm6
-; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm11
-; AVX512BW-NEXT:    vmovdqa64 128(%rdx), %zmm4
-; AVX512BW-NEXT:    vmovdqa64 128(%rcx), %zmm7
+; AVX512BW-NEXT:    vpermt2q %zmm5, %zmm21, %zmm8
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqa64 %zmm31, %zmm8
+; AVX512BW-NEXT:    vpermt2q %zmm27, %zmm21, %zmm8
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm10 = zmm18[1],zmm28[1],zmm18[3],zmm28[3],zmm18[5],zmm28[5],zmm18[7],zmm28[7]
+; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm10 {%k1}
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm6, %zmm10, %zmm6
+; AVX512BW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm6
+; AVX512BW-NEXT:    vpermt2q %zmm9, %zmm23, %zmm6
 ; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm8
-; AVX512BW-NEXT:    vpermt2q %zmm7, %zmm15, %zmm8
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm6, %zmm5, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm5
-; AVX512BW-NEXT:    vpermt2q %zmm14, %zmm9, %zmm5
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm6 = zmm23[1],zmm24[1],zmm23[3],zmm24[3],zmm23[5],zmm24[5],zmm23[7],zmm24[7]
+; AVX512BW-NEXT:    vpermt2q %zmm5, %zmm23, %zmm8
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm18, %zmm8
+; AVX512BW-NEXT:    vpermt2q %zmm28, %zmm23, %zmm8
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm31[0],zmm27[0],zmm31[2],zmm27[2],zmm31[4],zmm27[4],zmm31[6],zmm27[6]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm6, %zmm8, %zmm6
+; AVX512BW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm9, %zmm25, %zmm7
+; AVX512BW-NEXT:    vpermt2q %zmm5, %zmm25, %zmm4
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqa64 %zmm18, %zmm5
+; AVX512BW-NEXT:    vpermt2q %zmm28, %zmm25, %zmm5
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm31[1],zmm27[1],zmm31[3],zmm27[3],zmm31[5],zmm27[5],zmm31[7],zmm27[7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm4, %zmm5, %zmm4
+; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm4
+; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm19, %zmm4
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm5
+; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm19, %zmm5
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqa64 %zmm16, %zmm5
+; AVX512BW-NEXT:    vpermt2q %zmm13, %zmm19, %zmm5
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm6 = zmm11[0],zmm22[0],zmm11[2],zmm22[2],zmm11[4],zmm22[4],zmm11[6],zmm22[6]
 ; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm6 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm5
-; AVX512BW-NEXT:    vpermt2q %zmm17, %zmm9, %zmm5
-; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm8
-; AVX512BW-NEXT:    vpermt2q %zmm7, %zmm12, %zmm8
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm5, %zmm6, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm5
-; AVX512BW-NEXT:    vpermt2q %zmm17, %zmm13, %zmm5
-; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm6
-; AVX512BW-NEXT:    vpermt2q %zmm7, %zmm1, %zmm6
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqa64 %zmm23, %zmm3
-; AVX512BW-NEXT:    vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm23, %zmm6
-; AVX512BW-NEXT:    vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm24, %zmm13, %zmm6
-; AVX512BW-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm10[0],zmm14[0],zmm10[2],zmm14[2],zmm10[4],zmm14[4],zmm10[6],zmm14[6]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm5, %zmm6, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm17, %zmm18, %zmm2
-; AVX512BW-NEXT:    vpermt2q %zmm7, %zmm30, %zmm4
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
-; AVX512BW-NEXT:    vpermt2q %zmm24, %zmm18, %zmm3
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm10[1],zmm14[1],zmm10[3],zmm14[3],zmm10[5],zmm14[5],zmm10[7],zmm14[7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 192(%r10), %zmm10
-; AVX512BW-NEXT:    vmovdqa64 192(%rax), %zmm26
-; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm2
-; AVX512BW-NEXT:    vpermt2q %zmm26, %zmm11, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 192(%r8), %zmm23
-; AVX512BW-NEXT:    vmovdqa64 192(%r9), %zmm29
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm4 = zmm23[0],zmm29[0],zmm23[2],zmm29[2],zmm23[4],zmm29[4],zmm23[6],zmm29[6]
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm4 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 192(%rdi), %zmm2
-; AVX512BW-NEXT:    vmovdqa64 192(%rsi), %zmm5
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm6
-; AVX512BW-NEXT:    vpermt2q %zmm5, %zmm11, %zmm6
-; AVX512BW-NEXT:    vmovdqa64 192(%rdx), %zmm3
-; AVX512BW-NEXT:    vmovdqa64 192(%rcx), %zmm7
-; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm8
-; AVX512BW-NEXT:    vpermt2q %zmm7, %zmm15, %zmm8
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm6, %zmm4, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm4
-; AVX512BW-NEXT:    vpermt2q %zmm26, %zmm9, %zmm4
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm6 = zmm23[1],zmm29[1],zmm23[3],zmm29[3],zmm23[5],zmm29[5],zmm23[7],zmm29[7]
-; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm6 {%k1}
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm4, %zmm6, %zmm4
+; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm4
-; AVX512BW-NEXT:    vpermt2q %zmm5, %zmm9, %zmm4
-; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm8
-; AVX512BW-NEXT:    vpermt2q %zmm7, %zmm12, %zmm8
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm4, %zmm6, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm21, %zmm4
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm5
+; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm21, %zmm5
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqa64 %zmm16, %zmm5
+; AVX512BW-NEXT:    vpermt2q %zmm13, %zmm21, %zmm5
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm6 = zmm11[1],zmm22[1],zmm11[3],zmm22[3],zmm11[5],zmm22[5],zmm11[7],zmm22[7]
+; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm6 {%k1}
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm4, %zmm6, %zmm4
+; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm4
-; AVX512BW-NEXT:    vpermt2q %zmm5, %zmm13, %zmm4
-; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm6
-; AVX512BW-NEXT:    vpermt2q %zmm7, %zmm1, %zmm6
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqa64 %zmm23, %zmm6
-; AVX512BW-NEXT:    vpermt2q %zmm29, %zmm13, %zmm6
-; AVX512BW-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm10[0],zmm26[0],zmm10[2],zmm26[2],zmm10[4],zmm26[4],zmm10[6],zmm26[6]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm4, %zmm6, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm5, %zmm18, %zmm2
-; AVX512BW-NEXT:    vpermt2q %zmm7, %zmm30, %zmm3
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqa64 %zmm23, %zmm3
-; AVX512BW-NEXT:    vpermt2q %zmm29, %zmm18, %zmm3
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm10[1],zmm26[1],zmm10[3],zmm26[3],zmm10[5],zmm26[5],zmm10[7],zmm26[7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm0
+; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm23, %zmm4
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm5
+; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm23, %zmm5
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm5
+; AVX512BW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm22, %zmm23, %zmm5
+; AVX512BW-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm16[0],zmm13[0],zmm16[2],zmm13[2],zmm16[4],zmm13[4],zmm16[6],zmm13[6]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm4, %zmm5, %zmm4
+; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 192(%rdi), %zmm4
+; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm25, %zmm2
+; AVX512BW-NEXT:    vmovdqa64 192(%rdx), %zmm3
+; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm25, %zmm0
+; AVX512BW-NEXT:    vmovdqa64 192(%rcx), %zmm1
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm2
+; AVX512BW-NEXT:    vpermt2q %zmm22, %zmm25, %zmm2
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm16[1],zmm13[1],zmm16[3],zmm13[3],zmm16[5],zmm13[5],zmm16[7],zmm13[7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm0
 ; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 256(%r10), %zmm10
-; AVX512BW-NEXT:    vmovdqa64 256(%rax), %zmm22
-; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm2
-; AVX512BW-NEXT:    vpermt2q %zmm22, %zmm11, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 256(%r8), %zmm0
-; AVX512BW-NEXT:    vmovdqa64 256(%r9), %zmm24
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm4 = zmm0[0],zmm24[0],zmm0[2],zmm24[2],zmm0[4],zmm24[4],zmm0[6],zmm24[6]
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm17
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm4 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 256(%rdi), %zmm2
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm0
+; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm19, %zmm0
+; AVX512BW-NEXT:    vmovdqa64 192(%rsi), %zmm2
+; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm5
+; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm19, %zmm5
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqa64 192(%r10), %zmm9
+; AVX512BW-NEXT:    vmovdqa64 192(%rax), %zmm20
+; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm6
+; AVX512BW-NEXT:    vpermt2q %zmm20, %zmm19, %zmm6
+; AVX512BW-NEXT:    vmovdqa64 192(%r8), %zmm8
+; AVX512BW-NEXT:    vmovdqa64 192(%r9), %zmm0
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm7 = zmm8[0],zmm0[0],zmm8[2],zmm0[2],zmm8[4],zmm0[4],zmm8[6],zmm0[6]
+; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm7 {%k1}
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm5, %zmm7, %zmm5
+; AVX512BW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm5
+; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm21, %zmm5
+; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm6
+; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm21, %zmm6
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm6
+; AVX512BW-NEXT:    vpermt2q %zmm20, %zmm21, %zmm6
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm7 = zmm8[1],zmm0[1],zmm8[3],zmm0[3],zmm8[5],zmm0[5],zmm8[7],zmm0[7]
+; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm7 {%k1}
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm5, %zmm7, %zmm5
+; AVX512BW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm5
+; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm23, %zmm5
+; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm6
+; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm23, %zmm6
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm6
+; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm23, %zmm6
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm20[0],zmm9[2],zmm20[2],zmm9[4],zmm20[4],zmm9[6],zmm20[6]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm5, %zmm6, %zmm5
+; AVX512BW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm25, %zmm3
+; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm25, %zmm4
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm2
+; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm25, %zmm2
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm20[1],zmm9[3],zmm20[3],zmm9[5],zmm20[5],zmm9[7],zmm20[7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm2, %zmm1
+; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 256(%rdx), %zmm1
+; AVX512BW-NEXT:    vmovdqa64 256(%rcx), %zmm3
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm2
+; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm19, %zmm2
+; AVX512BW-NEXT:    vmovdqa64 256(%rdi), %zmm4
 ; AVX512BW-NEXT:    vmovdqa64 256(%rsi), %zmm5
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm6
-; AVX512BW-NEXT:    vpermt2q %zmm5, %zmm11, %zmm6
-; AVX512BW-NEXT:    vmovdqa64 256(%rdx), %zmm3
-; AVX512BW-NEXT:    vmovdqa64 256(%rcx), %zmm7
-; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm8
-; AVX512BW-NEXT:    vpermt2q %zmm7, %zmm15, %zmm8
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm6, %zmm4, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm4
-; AVX512BW-NEXT:    vpermt2q %zmm22, %zmm9, %zmm4
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm6 = zmm17[1],zmm24[1],zmm17[3],zmm24[3],zmm17[5],zmm24[5],zmm17[7],zmm24[7]
-; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm6 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm4
-; AVX512BW-NEXT:    vpermt2q %zmm5, %zmm9, %zmm4
-; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm8
-; AVX512BW-NEXT:    vpermt2q %zmm7, %zmm12, %zmm8
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm4, %zmm6, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm4
-; AVX512BW-NEXT:    vpermt2q %zmm5, %zmm13, %zmm4
-; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm6
-; AVX512BW-NEXT:    vpermt2q %zmm7, %zmm1, %zmm6
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm17, %zmm6
-; AVX512BW-NEXT:    vpermt2q %zmm24, %zmm13, %zmm6
-; AVX512BW-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm10[0],zmm22[0],zmm10[2],zmm22[2],zmm10[4],zmm22[4],zmm10[6],zmm22[6]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm4, %zmm6, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm5, %zmm18, %zmm2
-; AVX512BW-NEXT:    vpermt2q %zmm7, %zmm30, %zmm3
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqa64 %zmm17, %zmm3
-; AVX512BW-NEXT:    vpermt2q %zmm24, %zmm18, %zmm3
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm10[1],zmm22[1],zmm10[3],zmm22[3],zmm10[5],zmm22[5],zmm10[7],zmm22[7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 320(%rdi), %zmm2
-; AVX512BW-NEXT:    vmovdqa64 320(%rsi), %zmm3
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm5
-; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm11, %zmm5
-; AVX512BW-NEXT:    vmovdqa64 320(%rdx), %zmm14
-; AVX512BW-NEXT:    vmovdqa64 320(%rcx), %zmm6
-; AVX512BW-NEXT:    vmovdqa64 %zmm14, %zmm7
-; AVX512BW-NEXT:    vpermt2q %zmm6, %zmm15, %zmm7
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqa64 %zmm14, %zmm7
-; AVX512BW-NEXT:    vpermt2q %zmm6, %zmm12, %zmm7
-; AVX512BW-NEXT:    vmovdqa64 %zmm14, %zmm8
-; AVX512BW-NEXT:    vpermt2q %zmm6, %zmm1, %zmm8
-; AVX512BW-NEXT:    vpermt2q %zmm6, %zmm30, %zmm14
-; AVX512BW-NEXT:    vmovdqa64 384(%rdx), %zmm0
-; AVX512BW-NEXT:    vmovdqa64 384(%rcx), %zmm6
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm10
-; AVX512BW-NEXT:    vpermt2q %zmm6, %zmm15, %zmm10
-; AVX512BW-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm10
-; AVX512BW-NEXT:    vpermt2q %zmm6, %zmm12, %zmm10
-; AVX512BW-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm10
-; AVX512BW-NEXT:    vpermt2q %zmm6, %zmm1, %zmm10
-; AVX512BW-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm6, %zmm30, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 448(%rdx), %zmm0
-; AVX512BW-NEXT:    vmovdqa64 448(%rcx), %zmm6
-; AVX512BW-NEXT:    vpermi2q %zmm6, %zmm0, %zmm15
-; AVX512BW-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermi2q %zmm6, %zmm0, %zmm12
-; AVX512BW-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermi2q %zmm6, %zmm0, %zmm1
+; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm6
+; AVX512BW-NEXT:    vpermt2q %zmm5, %zmm19, %zmm6
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm2[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqa64 256(%r10), %zmm13
+; AVX512BW-NEXT:    vmovdqa64 256(%rax), %zmm2
+; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm7
+; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm19, %zmm7
+; AVX512BW-NEXT:    vmovdqa64 256(%r8), %zmm18
+; AVX512BW-NEXT:    vmovdqa64 256(%r9), %zmm15
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm8 = zmm18[0],zmm15[0],zmm18[2],zmm15[2],zmm18[4],zmm15[4],zmm18[6],zmm15[6]
+; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm8 {%k1}
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm6, %zmm8, %zmm6
+; AVX512BW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm6
+; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm21, %zmm6
+; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm7
+; AVX512BW-NEXT:    vpermt2q %zmm5, %zmm21, %zmm7
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm7
+; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm21, %zmm7
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm8 = zmm18[1],zmm15[1],zmm18[3],zmm15[3],zmm18[5],zmm15[5],zmm18[7],zmm15[7]
+; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm8 {%k1}
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm6, %zmm8, %zmm6
+; AVX512BW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm6
+; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm23, %zmm6
+; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm7
+; AVX512BW-NEXT:    vpermt2q %zmm5, %zmm23, %zmm7
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqa64 %zmm18, %zmm7
+; AVX512BW-NEXT:    vpermt2q %zmm15, %zmm23, %zmm7
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm13[0],zmm2[0],zmm13[2],zmm2[2],zmm13[4],zmm2[4],zmm13[6],zmm2[6]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm6, %zmm7, %zmm6
+; AVX512BW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm25, %zmm1
+; AVX512BW-NEXT:    vpermt2q %zmm5, %zmm25, %zmm4
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqa64 %zmm18, %zmm3
+; AVX512BW-NEXT:    vpermt2q %zmm15, %zmm25, %zmm3
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm13[1],zmm2[1],zmm13[3],zmm2[3],zmm13[5],zmm2[5],zmm13[7],zmm2[7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm3, %zmm1
 ; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm6, %zmm30, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 320(%r10), %zmm31
-; AVX512BW-NEXT:    vmovdqa64 320(%rax), %zmm12
-; AVX512BW-NEXT:    vmovdqa64 %zmm31, %zmm1
-; AVX512BW-NEXT:    vpermt2q %zmm12, %zmm11, %zmm1
-; AVX512BW-NEXT:    vmovdqa64 320(%r8), %zmm17
-; AVX512BW-NEXT:    vmovdqa64 320(%r9), %zmm4
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm10 = zmm17[0],zmm4[0],zmm17[2],zmm4[2],zmm17[4],zmm4[4],zmm17[6],zmm4[6]
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm10 {%k1}
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm5, %zmm10, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm31, %zmm1
-; AVX512BW-NEXT:    vpermt2q %zmm12, %zmm9, %zmm1
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm5 = zmm17[1],zmm4[1],zmm17[3],zmm4[3],zmm17[5],zmm4[5],zmm17[7],zmm4[7]
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm5 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm1
-; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm9, %zmm1
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm5, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm1
-; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm13, %zmm1
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqa64 %zmm17, %zmm5
-; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm13, %zmm5
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm31[0],zmm12[0],zmm31[2],zmm12[2],zmm31[4],zmm12[4],zmm31[6],zmm12[6]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm5, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm18, %zmm2
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm14[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqa64 %zmm17, %zmm2
-; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm18, %zmm2
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm31[1],zmm12[1],zmm31[3],zmm12[3],zmm31[5],zmm12[5],zmm31[7],zmm12[7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm2, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 384(%rdi), %zmm0
-; AVX512BW-NEXT:    vmovdqa64 384(%rsi), %zmm1
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm3
-; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm2
-; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm11, %zmm3
+; AVX512BW-NEXT:    vmovdqa64 320(%rdx), %zmm1
+; AVX512BW-NEXT:    vmovdqa64 320(%rcx), %zmm4
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm3
+; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm19, %zmm3
+; AVX512BW-NEXT:    vmovdqa64 320(%rdi), %zmm6
+; AVX512BW-NEXT:    vmovdqa64 320(%rsi), %zmm7
+; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm5
+; AVX512BW-NEXT:    vpermt2q %zmm7, %zmm19, %zmm5
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm8 = ymm5[0,1,2,3],ymm3[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqa64 320(%r10), %zmm16
+; AVX512BW-NEXT:    vmovdqa64 320(%rax), %zmm22
+; AVX512BW-NEXT:    vmovdqa64 %zmm16, %zmm12
+; AVX512BW-NEXT:    vpermt2q %zmm22, %zmm19, %zmm12
+; AVX512BW-NEXT:    vmovdqa64 320(%r8), %zmm29
+; AVX512BW-NEXT:    vmovdqa64 320(%r9), %zmm5
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm14 = zmm29[0],zmm5[0],zmm29[2],zmm5[2],zmm29[4],zmm5[4],zmm29[6],zmm5[6]
+; AVX512BW-NEXT:    vmovdqa64 %zmm12, %zmm14 {%k1}
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm8, %zmm14, %zmm3
 ; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm3
-; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm9, %zmm3
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm8
+; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm21, %zmm8
+; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm12
+; AVX512BW-NEXT:    vpermt2q %zmm7, %zmm21, %zmm12
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3],ymm8[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqa64 %zmm16, %zmm12
+; AVX512BW-NEXT:    vpermt2q %zmm22, %zmm21, %zmm12
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm14 = zmm29[1],zmm5[1],zmm29[3],zmm5[3],zmm29[5],zmm5[5],zmm29[7],zmm5[7]
+; AVX512BW-NEXT:    vmovdqa64 %zmm12, %zmm14 {%k1}
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm8, %zmm14, %zmm3
 ; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm3
-; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm13, %zmm3
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm8
+; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm23, %zmm8
+; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm12
+; AVX512BW-NEXT:    vpermt2q %zmm7, %zmm23, %zmm12
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3],ymm8[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqa64 %zmm29, %zmm12
+; AVX512BW-NEXT:    vpermt2q %zmm5, %zmm23, %zmm12
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm12 {%k1} = zmm16[0],zmm22[0],zmm16[2],zmm22[2],zmm16[4],zmm22[4],zmm16[6],zmm22[6]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm8, %zmm12, %zmm3
 ; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm18, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 384(%r10), %zmm11
-; AVX512BW-NEXT:    vmovdqa64 384(%rax), %zmm20
-; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm20, %zmm2, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm20, %zmm9, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 384(%r8), %zmm30
-; AVX512BW-NEXT:    vmovdqa64 384(%r9), %zmm0
-; AVX512BW-NEXT:    vmovdqa64 %zmm30, %zmm1
-; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm13, %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 448(%r10), %zmm16
-; AVX512BW-NEXT:    vmovdqa64 448(%rax), %zmm8
-; AVX512BW-NEXT:    vmovdqa64 %zmm16, %zmm1
-; AVX512BW-NEXT:    vpermt2q %zmm8, %zmm2, %zmm1
+; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm25, %zmm1
+; AVX512BW-NEXT:    vpermt2q %zmm7, %zmm25, %zmm6
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqa64 %zmm29, %zmm4
+; AVX512BW-NEXT:    vpermt2q %zmm5, %zmm25, %zmm4
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm4 {%k1} = zmm16[1],zmm22[1],zmm16[3],zmm22[3],zmm16[5],zmm22[5],zmm16[7],zmm22[7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm1
 ; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 384(%rdx), %zmm3
+; AVX512BW-NEXT:    vmovdqa64 384(%rcx), %zmm1
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm4
+; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm19, %zmm4
+; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm4
+; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm21, %zmm4
+; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm4
+; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm23, %zmm4
+; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm25, %zmm3
+; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 384(%rdi), %zmm3
+; AVX512BW-NEXT:    vmovdqa64 384(%rsi), %zmm1
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm4
+; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm19, %zmm4
+; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm4
+; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm21, %zmm4
+; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm4
+; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm23, %zmm4
+; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm25, %zmm3
+; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 448(%rdx), %zmm3
+; AVX512BW-NEXT:    vmovdqa64 448(%rcx), %zmm1
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm4
+; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm19, %zmm4
+; AVX512BW-NEXT:    vmovdqu64 %zmm4, (%rsp) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm4
+; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm21, %zmm4
+; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm4
+; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm23, %zmm4
+; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm25, %zmm3
+; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqa64 448(%rdi), %zmm3
 ; AVX512BW-NEXT:    vmovdqa64 448(%rsi), %zmm1
-; AVX512BW-NEXT:    vpermi2q %zmm1, %zmm3, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm16, %zmm2
-; AVX512BW-NEXT:    vpermt2q %zmm8, %zmm9, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermi2q %zmm1, %zmm3, %zmm9
-; AVX512BW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 448(%r8), %zmm19
-; AVX512BW-NEXT:    vmovdqa64 448(%r9), %zmm7
-; AVX512BW-NEXT:    vmovdqa64 %zmm19, %zmm2
-; AVX512BW-NEXT:    vpermt2q %zmm7, %zmm13, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermi2q %zmm1, %zmm3, %zmm13
-; AVX512BW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm18, %zmm3
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm4
+; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm19, %zmm4
+; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm4
+; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm21, %zmm4
+; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm4
+; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm23, %zmm4
+; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm25, %zmm3
 ; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm11 = [0,8,0,8,0,8,0,8]
+; AVX512BW-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vmovdqa64 %zmm26, %zmm1
+; AVX512BW-NEXT:    vpermt2q %zmm17, %zmm11, %zmm1
+; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm12 = [1,9,1,9,1,9,1,9]
+; AVX512BW-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vmovdqa64 %zmm26, %zmm1
+; AVX512BW-NEXT:    vpermt2q %zmm17, %zmm12, %zmm1
+; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm10 = [2,10,2,10,2,10,2,10]
+; AVX512BW-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vmovdqa64 %zmm26, %zmm1
+; AVX512BW-NEXT:    vpermt2q %zmm17, %zmm10, %zmm1
+; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm4 = [3,11,3,11,3,11,3,11]
+; AVX512BW-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vpermt2q %zmm17, %zmm4, %zmm26
+; AVX512BW-NEXT:    vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqa64 %zmm30, %zmm1
-; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm18, %zmm1
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm5
-; AVX512BW-NEXT:    vmovdqa64 %zmm19, %zmm1
-; AVX512BW-NEXT:    vpermt2q %zmm7, %zmm18, %zmm1
+; AVX512BW-NEXT:    vpermt2q %zmm24, %zmm11, %zmm1
 ; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm15 = [0,8,0,8,0,8,0,8]
-; AVX512BW-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm21, %zmm1
-; AVX512BW-NEXT:    vpermt2q %zmm27, %zmm15, %zmm1
+; AVX512BW-NEXT:    vmovdqa64 %zmm30, %zmm1
+; AVX512BW-NEXT:    vpermt2q %zmm24, %zmm12, %zmm1
 ; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm14 = [1,9,1,9,1,9,1,9]
-; AVX512BW-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm21, %zmm1
-; AVX512BW-NEXT:    vpermt2q %zmm27, %zmm14, %zmm1
+; AVX512BW-NEXT:    vmovdqa64 %zmm30, %zmm1
+; AVX512BW-NEXT:    vpermt2q %zmm24, %zmm10, %zmm1
 ; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm6 = [2,10,2,10,2,10,2,10]
-; AVX512BW-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm21, %zmm1
-; AVX512BW-NEXT:    vpermt2q %zmm27, %zmm6, %zmm1
+; AVX512BW-NEXT:    vpermt2q %zmm24, %zmm4, %zmm30
+; AVX512BW-NEXT:    vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm31, %zmm1
+; AVX512BW-NEXT:    vpermt2q %zmm27, %zmm11, %zmm1
 ; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [3,11,3,11,3,11,3,11]
-; AVX512BW-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vpermt2q %zmm27, %zmm1, %zmm21
-; AVX512BW-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm3
-; AVX512BW-NEXT:    vpermt2q %zmm28, %zmm15, %zmm3
-; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm3
-; AVX512BW-NEXT:    vpermt2q %zmm28, %zmm14, %zmm3
-; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm25
-; AVX512BW-NEXT:    vpermt2q %zmm28, %zmm6, %zmm25
-; AVX512BW-NEXT:    vpermt2q %zmm28, %zmm1, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 (%rsp), %zmm27 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm27, %zmm2
+; AVX512BW-NEXT:    vmovdqa64 %zmm31, %zmm1
+; AVX512BW-NEXT:    vpermt2q %zmm27, %zmm12, %zmm1
+; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm31, %zmm1
+; AVX512BW-NEXT:    vpermt2q %zmm27, %zmm10, %zmm1
+; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm27, %zmm4, %zmm31
+; AVX512BW-NEXT:    vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm15, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm27, %zmm2
-; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm14, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm27, %zmm2
-; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm6, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm1, %zmm27
-; AVX512BW-NEXT:    vmovdqu64 %zmm27, (%rsp) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm3
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm15, %zmm3
-; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm3
-; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm14, %zmm3
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm1
+; AVX512BW-NEXT:    vpermt2q %zmm28, %zmm11, %zmm1
+; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm1
+; AVX512BW-NEXT:    vpermt2q %zmm28, %zmm12, %zmm1
+; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm31
+; AVX512BW-NEXT:    vpermt2q %zmm28, %zmm10, %zmm31
+; AVX512BW-NEXT:    vpermt2q %zmm28, %zmm4, %zmm3
 ; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm18
-; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm6, %zmm18
-; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm1, %zmm9
-; AVX512BW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm3
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm9, %zmm15, %zmm3
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm28, %zmm3
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm11, %zmm3
 ; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm3
-; AVX512BW-NEXT:    vpermt2q %zmm9, %zmm14, %zmm3
+; AVX512BW-NEXT:    vmovdqa64 %zmm28, %zmm3
+; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm12, %zmm3
 ; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm3
-; AVX512BW-NEXT:    vpermt2q %zmm9, %zmm6, %zmm3
+; AVX512BW-NEXT:    vmovdqa64 %zmm28, %zmm3
+; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm10, %zmm3
 ; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm9, %zmm1, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm3
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm15, %zmm3
+; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm4, %zmm28
+; AVX512BW-NEXT:    vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm3
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm11, %zmm3
 ; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm3
-; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm14, %zmm3
+; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm3
+; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm12, %zmm3
 ; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm13
-; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm6, %zmm13
-; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm1, %zmm9
+; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm27
+; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm10, %zmm27
+; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm4, %zmm6
+; AVX512BW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm1
+; AVX512BW-NEXT:    vpermt2q %zmm20, %zmm11, %zmm1
+; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm1
+; AVX512BW-NEXT:    vpermt2q %zmm20, %zmm12, %zmm1
+; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm1
+; AVX512BW-NEXT:    vpermt2q %zmm20, %zmm10, %zmm1
+; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm20, %zmm4, %zmm9
 ; AVX512BW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm3
-; AVX512BW-NEXT:    vpermt2q %zmm26, %zmm15, %zmm3
-; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm3
-; AVX512BW-NEXT:    vpermt2q %zmm26, %zmm14, %zmm3
-; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm3
-; AVX512BW-NEXT:    vpermt2q %zmm26, %zmm6, %zmm3
-; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm26, %zmm1, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm23, %zmm3
-; AVX512BW-NEXT:    vpermt2q %zmm29, %zmm15, %zmm3
-; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm23, %zmm3
-; AVX512BW-NEXT:    vpermt2q %zmm29, %zmm14, %zmm3
-; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm23, %zmm10
-; AVX512BW-NEXT:    vpermt2q %zmm29, %zmm6, %zmm10
-; AVX512BW-NEXT:    vpermt2q %zmm29, %zmm1, %zmm23
-; AVX512BW-NEXT:    vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm23, %zmm2
-; AVX512BW-NEXT:    vpermt2q %zmm22, %zmm15, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm23, %zmm2
-; AVX512BW-NEXT:    vpermt2q %zmm22, %zmm14, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm23, %zmm2
-; AVX512BW-NEXT:    vpermt2q %zmm22, %zmm6, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm22, %zmm1, %zmm23
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm3
-; AVX512BW-NEXT:    vpermt2q %zmm24, %zmm15, %zmm3
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm3
+; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm11, %zmm3
 ; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm3
-; AVX512BW-NEXT:    vpermt2q %zmm24, %zmm14, %zmm3
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm3
+; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm12, %zmm3
 ; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm26
-; AVX512BW-NEXT:    vpermt2q %zmm24, %zmm6, %zmm26
-; AVX512BW-NEXT:    vpermt2q %zmm24, %zmm1, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm31, %zmm2
-; AVX512BW-NEXT:    vpermt2q %zmm12, %zmm15, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm31, %zmm2
-; AVX512BW-NEXT:    vpermt2q %zmm12, %zmm14, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm31, %zmm2
-; AVX512BW-NEXT:    vpermt2q %zmm12, %zmm6, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm12, %zmm1, %zmm31
-; AVX512BW-NEXT:    vmovdqa64 %zmm17, %zmm29
-; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm15, %zmm29
-; AVX512BW-NEXT:    vmovdqa64 %zmm17, %zmm2
-; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm14, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm17, %zmm24
-; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm6, %zmm24
-; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm1, %zmm17
-; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm2
-; AVX512BW-NEXT:    vpermt2q %zmm20, %zmm15, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm2
-; AVX512BW-NEXT:    vpermt2q %zmm20, %zmm14, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm11[0],zmm20[0],zmm11[2],zmm20[2],zmm11[4],zmm20[4],zmm11[6],zmm20[6]
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm11[1],zmm20[1],zmm11[3],zmm20[3],zmm11[5],zmm20[5],zmm11[7],zmm20[7]
-; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm27
-; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm22
-; AVX512BW-NEXT:    vpermt2q %zmm20, %zmm6, %zmm22
-; AVX512BW-NEXT:    vpermt2q %zmm20, %zmm1, %zmm11
-; AVX512BW-NEXT:    vmovdqa64 %zmm30, %zmm21
-; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm15, %zmm21
-; AVX512BW-NEXT:    vmovdqa64 %zmm30, %zmm28
-; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm14, %zmm28
-; AVX512BW-NEXT:    vmovdqa64 %zmm30, %zmm20
-; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm6, %zmm20
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm2 = zmm30[0],zmm0[0],zmm30[2],zmm0[2],zmm30[4],zmm0[4],zmm30[6],zmm0[6]
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm3 = zmm30[1],zmm0[1],zmm30[3],zmm0[3],zmm30[5],zmm0[5],zmm30[7],zmm0[7]
-; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm1, %zmm30
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm20
+; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm10, %zmm20
+; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm4, %zmm1
+; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm0
+; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm11, %zmm0
+; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm0
+; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm12, %zmm0
+; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm0
+; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm10, %zmm0
+; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm4, %zmm13
+; AVX512BW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm18, %zmm0
+; AVX512BW-NEXT:    vpermt2q %zmm15, %zmm11, %zmm0
+; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm18, %zmm0
+; AVX512BW-NEXT:    vpermt2q %zmm15, %zmm12, %zmm0
+; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm18, %zmm28
+; AVX512BW-NEXT:    vpermt2q %zmm15, %zmm10, %zmm28
+; AVX512BW-NEXT:    vpermt2q %zmm15, %zmm4, %zmm18
+; AVX512BW-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqa64 %zmm16, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm8, %zmm15, %zmm0
+; AVX512BW-NEXT:    vpermt2q %zmm22, %zmm11, %zmm0
 ; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqa64 %zmm16, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm8, %zmm14, %zmm0
+; AVX512BW-NEXT:    vpermt2q %zmm22, %zmm12, %zmm0
 ; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm16[0],zmm8[0],zmm16[2],zmm8[2],zmm16[4],zmm8[4],zmm16[6],zmm8[6]
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm16[1],zmm8[1],zmm16[3],zmm8[3],zmm16[5],zmm8[5],zmm16[7],zmm8[7]
-; AVX512BW-NEXT:    vmovdqa64 %zmm16, %zmm12
-; AVX512BW-NEXT:    vpermt2q %zmm8, %zmm6, %zmm12
-; AVX512BW-NEXT:    vpermt2q %zmm8, %zmm1, %zmm16
-; AVX512BW-NEXT:    vpermi2q %zmm7, %zmm19, %zmm15
-; AVX512BW-NEXT:    vpermi2q %zmm7, %zmm19, %zmm14
-; AVX512BW-NEXT:    vpermi2q %zmm7, %zmm19, %zmm6
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm8 = zmm19[0],zmm7[0],zmm19[2],zmm7[2],zmm19[4],zmm7[4],zmm19[6],zmm7[6]
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm0 = zmm19[1],zmm7[1],zmm19[3],zmm7[3],zmm19[5],zmm7[5],zmm19[7],zmm7[7]
-; AVX512BW-NEXT:    vpermt2q %zmm7, %zmm1, %zmm19
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX512BW-NEXT:    # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm2 {%k1}
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm2, %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX512BW-NEXT:    # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm3, %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX512BW-NEXT:    # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm9, %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX512BW-NEXT:    # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm27, %zmm1
+; AVX512BW-NEXT:    vmovdqa64 %zmm16, %zmm0
+; AVX512BW-NEXT:    vpermt2q %zmm22, %zmm10, %zmm0
+; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm22, %zmm4, %zmm16
+; AVX512BW-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm29, %zmm0
+; AVX512BW-NEXT:    vpermt2q %zmm5, %zmm11, %zmm0
+; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm29, %zmm0
+; AVX512BW-NEXT:    vpermt2q %zmm5, %zmm12, %zmm0
+; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm29, %zmm22
+; AVX512BW-NEXT:    vpermt2q %zmm5, %zmm10, %zmm22
+; AVX512BW-NEXT:    vpermt2q %zmm5, %zmm4, %zmm29
+; AVX512BW-NEXT:    vmovdqa64 384(%r10), %zmm13
+; AVX512BW-NEXT:    vmovdqa64 384(%rax), %zmm0
+; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm1
+; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm11, %zmm1
 ; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX512BW-NEXT:    # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm8 {%k1}
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm8, %zmm1
+; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm1
+; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm12, %zmm1
 ; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX512BW-NEXT:    # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm0 {%k1}
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm14
+; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm19, %zmm14
+; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm15
+; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm21, %zmm15
+; AVX512BW-NEXT:    vmovdqa64 384(%r8), %zmm5
+; AVX512BW-NEXT:    vmovdqa64 384(%r9), %zmm2
+; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm8
+; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm23, %zmm8
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm13[0],zmm0[0],zmm13[2],zmm0[2],zmm13[4],zmm0[4],zmm13[6],zmm0[6]
+; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm6
+; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm25, %zmm6
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm6 {%k1} = zmm13[1],zmm0[1],zmm13[3],zmm0[3],zmm13[5],zmm0[5],zmm13[7],zmm0[7]
+; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm30
+; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm10, %zmm30
+; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm4, %zmm13
+; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm24
+; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm11, %zmm24
+; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm26
+; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm12, %zmm26
+; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm17
+; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm10, %zmm17
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm7 = zmm5[0],zmm2[0],zmm5[2],zmm2[2],zmm5[4],zmm2[4],zmm5[6],zmm2[6]
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm9 = zmm5[1],zmm2[1],zmm5[3],zmm2[3],zmm5[5],zmm2[5],zmm5[7],zmm2[7]
+; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm4, %zmm5
+; AVX512BW-NEXT:    vmovdqa64 448(%r10), %zmm16
+; AVX512BW-NEXT:    vmovdqa64 448(%rax), %zmm1
+; AVX512BW-NEXT:    vmovdqa64 %zmm16, %zmm0
+; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm11, %zmm0
+; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm16, %zmm0
+; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm12, %zmm0
 ; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermi2q %zmm1, %zmm16, %zmm19
+; AVX512BW-NEXT:    vpermi2q %zmm1, %zmm16, %zmm21
+; AVX512BW-NEXT:    vmovdqa64 448(%r8), %zmm3
+; AVX512BW-NEXT:    vmovdqa64 448(%r9), %zmm0
+; AVX512BW-NEXT:    vpermi2q %zmm0, %zmm3, %zmm23
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm23 {%k1} = zmm16[0],zmm1[0],zmm16[2],zmm1[2],zmm16[4],zmm1[4],zmm16[6],zmm1[6]
+; AVX512BW-NEXT:    vpermi2q %zmm0, %zmm3, %zmm25
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm25 {%k1} = zmm16[1],zmm1[1],zmm16[3],zmm1[3],zmm16[5],zmm1[5],zmm16[7],zmm1[7]
+; AVX512BW-NEXT:    vmovdqa64 %zmm16, %zmm18
+; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm10, %zmm18
+; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm4, %zmm16
+; AVX512BW-NEXT:    vpermi2q %zmm0, %zmm3, %zmm11
+; AVX512BW-NEXT:    vpermi2q %zmm0, %zmm3, %zmm12
+; AVX512BW-NEXT:    vpermi2q %zmm0, %zmm3, %zmm10
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm1 = zmm3[0],zmm0[0],zmm3[2],zmm0[2],zmm3[4],zmm0[4],zmm3[6],zmm0[6]
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm2 = zmm3[1],zmm0[1],zmm3[3],zmm0[3],zmm3[5],zmm0[5],zmm3[7],zmm0[7]
+; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm4, %zmm3
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
 ; AVX512BW-NEXT:    # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm0
+; AVX512BW-NEXT:    vmovdqa64 %zmm14, %zmm7 {%k1}
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm7, %zmm0
 ; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
 ; AVX512BW-NEXT:    # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm5, %zmm0
+; AVX512BW-NEXT:    vmovdqa64 %zmm15, %zmm9 {%k1}
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm9, %zmm0
+; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX512BW-NEXT:    # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm8, %zmm0
 ; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm25 {%k1}
+; AVX512BW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX512BW-NEXT:    # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm6, %zmm0
+; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vpblendd $240, (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX512BW-NEXT:    # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqa64 %zmm19, %zmm1 {%k1}
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX512BW-NEXT:    # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqa64 %zmm21, %zmm2 {%k1}
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm0
+; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX512BW-NEXT:    # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm23, %zmm0
+; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX512BW-NEXT:    # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm25, %zmm0
+; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm8 {%k1}
 ; AVX512BW-NEXT:    vmovdqa (%rcx), %ymm0
 ; AVX512BW-NEXT:    vmovdqa (%rdx), %ymm1
 ; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
-; AVX512BW-NEXT:    vmovdqa (%rsi), %ymm3
-; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm7
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2]
-; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm25, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa (%rsi), %ymm4
+; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm6
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm4[0],ymm6[2],ymm4[2]
+; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm8, %zmm2
+; AVX512BW-NEXT:    vmovdqu64 %zmm2, (%rsp) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm4 {%k1}
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm7 {%k1}
 ; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3]
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm4[1],ymm6[3],ymm4[3]
 ; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm0
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm7, %zmm0
 ; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm18 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm31 {%k1}
 ; AVX512BW-NEXT:    vmovdqa 64(%rcx), %ymm0
 ; AVX512BW-NEXT:    vmovdqa 64(%rdx), %ymm1
 ; AVX512BW-NEXT:    vmovdqa 64(%rsi), %ymm2
-; AVX512BW-NEXT:    vmovdqa 64(%rdi), %ymm3
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm7 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm8 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
-; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm7 = ymm8[2,3],ymm7[2,3]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm7, %zmm18, %zmm4
-; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 (%rsp), %zmm4 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm5 {%k1}
+; AVX512BW-NEXT:    vmovdqa 64(%rdi), %ymm4
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm6 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm7 = ymm4[0],ymm2[0],ymm4[2],ymm2[2]
+; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm6 = ymm7[2,3],ymm6[2,3]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm6, %zmm31, %zmm6
+; AVX512BW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm7 {%k1}
 ; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm2[1],ymm3[3],ymm2[3]
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm4[1],ymm2[1],ymm4[3],ymm2[3]
 ; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm5, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm7, %zmm8
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm13 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm27 {%k1}
 ; AVX512BW-NEXT:    vmovdqa 128(%rcx), %ymm0
 ; AVX512BW-NEXT:    vmovdqa 128(%rdx), %ymm1
 ; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
-; AVX512BW-NEXT:    vmovdqa 128(%rsi), %ymm3
-; AVX512BW-NEXT:    vmovdqa 128(%rdi), %ymm7
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2]
-; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm13, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, (%rsp) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa 128(%rsi), %ymm4
+; AVX512BW-NEXT:    vmovdqa 128(%rdi), %ymm6
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm4[0],ymm6[2],ymm4[2]
+; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm27, %zmm23
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm4 {%k1}
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm7 {%k1}
 ; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3]
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm4[1],ymm6[3],ymm4[3]
 ; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm7, %zmm25
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm10 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm20 {%k1}
 ; AVX512BW-NEXT:    vmovdqa 192(%rcx), %ymm0
 ; AVX512BW-NEXT:    vmovdqa 192(%rdx), %ymm1
 ; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
-; AVX512BW-NEXT:    vmovdqa 192(%rsi), %ymm3
-; AVX512BW-NEXT:    vmovdqa 192(%rdi), %ymm7
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2]
-; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm10, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa 192(%rsi), %ymm4
+; AVX512BW-NEXT:    vmovdqa 192(%rdi), %ymm6
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm4[0],ymm6[2],ymm4[2]
+; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm20, %zmm21
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm4 {%k1}
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm7 {%k1}
 ; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3]
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm4[1],ymm6[3],ymm4[3]
 ; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm9
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm7, %zmm19
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm26 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm28 {%k1}
 ; AVX512BW-NEXT:    vmovdqa 256(%rcx), %ymm0
 ; AVX512BW-NEXT:    vmovdqa 256(%rdx), %ymm1
 ; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
-; AVX512BW-NEXT:    vmovdqa 256(%rsi), %ymm3
-; AVX512BW-NEXT:    vmovdqa 256(%rdi), %ymm7
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2]
-; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm26, %zmm26
+; AVX512BW-NEXT:    vmovdqa 256(%rsi), %ymm4
+; AVX512BW-NEXT:    vmovdqa 256(%rdi), %ymm6
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm4[0],ymm6[2],ymm4[2]
+; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm28, %zmm15
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm23, %zmm2 {%k1}
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm7 {%k1}
 ; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3]
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm4[1],ymm6[3],ymm4[3]
 ; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm7
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm7, %zmm28
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm24 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm22 {%k1}
 ; AVX512BW-NEXT:    vmovdqa 320(%rcx), %ymm0
 ; AVX512BW-NEXT:    vmovdqa 320(%rdx), %ymm1
 ; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
-; AVX512BW-NEXT:    vmovdqa 320(%rsi), %ymm3
-; AVX512BW-NEXT:    vmovdqa64 320(%rdi), %ymm23
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm8 = ymm23[0],ymm3[0],ymm23[2],ymm3[2]
-; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm24, %zmm8
-; AVX512BW-NEXT:    vmovdqa64 %zmm31, %zmm17 {%k1}
+; AVX512BW-NEXT:    vmovdqa 320(%rsi), %ymm4
+; AVX512BW-NEXT:    vmovdqa 320(%rdi), %ymm6
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm4[0],ymm6[2],ymm4[2]
+; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm22, %zmm22
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm29 {%k1}
 ; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm23[1],ymm3[1],ymm23[3],ymm3[3]
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm4[1],ymm6[3],ymm4[3]
 ; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm17, %zmm17
-; AVX512BW-NEXT:    vmovdqa64 %zmm22, %zmm20 {%k1}
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm29, %zmm27
+; AVX512BW-NEXT:    vmovdqa64 %zmm30, %zmm17 {%k1}
 ; AVX512BW-NEXT:    vmovdqa 384(%rcx), %ymm0
 ; AVX512BW-NEXT:    vmovdqa 384(%rdx), %ymm1
 ; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
-; AVX512BW-NEXT:    vmovdqa 384(%rsi), %ymm3
-; AVX512BW-NEXT:    vmovdqa64 384(%rdi), %ymm18
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm13 = ymm18[0],ymm3[0],ymm18[2],ymm3[2]
-; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm13[2,3],ymm2[2,3]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm20, %zmm20
-; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm30 {%k1}
+; AVX512BW-NEXT:    vmovdqa 384(%rsi), %ymm4
+; AVX512BW-NEXT:    vmovdqa 384(%rdi), %ymm6
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm4[0],ymm6[2],ymm4[2]
+; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm17, %zmm14
+; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm5 {%k1}
 ; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm18[1],ymm3[1],ymm18[3],ymm3[3]
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm4[1],ymm6[3],ymm4[3]
 ; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm30, %zmm24
-; AVX512BW-NEXT:    vmovdqa64 %zmm12, %zmm6 {%k1}
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm5, %zmm13
+; AVX512BW-NEXT:    vmovdqa64 %zmm18, %zmm10 {%k1}
 ; AVX512BW-NEXT:    vmovdqa 448(%rcx), %ymm0
 ; AVX512BW-NEXT:    vmovdqa 448(%rdx), %ymm1
 ; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
-; AVX512BW-NEXT:    vmovdqa 448(%rsi), %ymm3
-; AVX512BW-NEXT:    vmovdqa 448(%rdi), %ymm10
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm12 = ymm10[0],ymm3[0],ymm10[2],ymm3[2]
-; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm12[2,3],ymm2[2,3]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm6, %zmm31
+; AVX512BW-NEXT:    vmovdqa 448(%rsi), %ymm4
+; AVX512BW-NEXT:    vmovdqa 448(%rdi), %ymm5
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm6 = ymm5[0],ymm4[0],ymm5[2],ymm4[2]
+; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm6[2,3],ymm2[2,3]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm10, %zmm10
 ; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm10[1],ymm3[1],ymm10[3],ymm3[3]
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm5[1],ymm4[1],ymm5[3],ymm4[3]
 ; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm16, %zmm19 {%k1}
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm19, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
-; AVX512BW-NEXT:    vmovdqa (%rsi), %xmm1
-; AVX512BW-NEXT:    vinserti128 $1, (%rcx), %ymm1, %ymm1
-; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm2
-; AVX512BW-NEXT:    vinserti128 $1, (%rdx), %ymm2, %ymm2
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[2],ymm1[2]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm3, %zmm4, %zmm3
+; AVX512BW-NEXT:    vmovdqa64 %zmm16, %zmm3 {%k1}
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm3, %zmm9
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm3 {%k1}
+; AVX512BW-NEXT:    vmovdqa (%rsi), %xmm0
+; AVX512BW-NEXT:    vinserti128 $1, (%rcx), %ymm0, %ymm1
+; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512BW-NEXT:    vinserti128 $1, (%rdx), %ymm0, %ymm2
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm2[0],ymm1[0],ymm2[2],ymm1[2]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm3, %zmm0
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm10 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
 ; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm10, %zmm2
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm2
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm11 {%k1}
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm5 {%k1}
 ; AVX512BW-NEXT:    vmovdqa 64(%rsi), %xmm1
 ; AVX512BW-NEXT:    vinserti128 $1, 64(%rcx), %ymm1, %ymm1
-; AVX512BW-NEXT:    vmovdqa 64(%rdi), %xmm4
-; AVX512BW-NEXT:    vinserti128 $1, 64(%rdx), %ymm4, %ymm4
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm10 = ymm4[0],ymm1[0],ymm4[2],ymm1[2]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm10, %zmm11, %zmm10
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm12 {%k1}
+; AVX512BW-NEXT:    vmovdqa 64(%rdi), %xmm3
+; AVX512BW-NEXT:    vinserti128 $1, 64(%rdx), %ymm3, %ymm4
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm4[0],ymm1[0],ymm4[2],ymm1[2]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm3, %zmm5, %zmm3
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm6 {%k1}
 ; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm4[1],ymm1[1],ymm4[3],ymm1[3]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm12, %zmm11
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm6, %zmm5
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm13 {%k1}
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm7 {%k1}
 ; AVX512BW-NEXT:    vmovdqa 128(%rsi), %xmm1
 ; AVX512BW-NEXT:    vinserti128 $1, 128(%rcx), %ymm1, %ymm1
 ; AVX512BW-NEXT:    vmovdqa 128(%rdi), %xmm4
-; AVX512BW-NEXT:    vinserti128 $1, 128(%rdx), %ymm4, %ymm12
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm4 = ymm12[0],ymm1[0],ymm12[2],ymm1[2]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm4, %zmm13, %zmm4
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm18 {%k1}
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm12[1],ymm1[1],ymm12[3],ymm1[3]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm18, %zmm19
+; AVX512BW-NEXT:    vinserti128 $1, 128(%rdx), %ymm4, %ymm6
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm4 = ymm6[0],ymm1[0],ymm6[2],ymm1[2]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm4, %zmm7, %zmm4
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm16 {%k1}
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm1[1],ymm6[3],ymm1[3]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm16, %zmm6
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm18 {%k1}
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm17 {%k1}
 ; AVX512BW-NEXT:    vmovdqa 192(%rsi), %xmm1
 ; AVX512BW-NEXT:    vinserti128 $1, 192(%rcx), %ymm1, %ymm1
-; AVX512BW-NEXT:    vmovdqa 192(%rdi), %xmm12
-; AVX512BW-NEXT:    vinserti128 $1, 192(%rdx), %ymm12, %ymm12
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm1[0],ymm12[2],ymm1[2]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm13, %zmm18, %zmm30
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa 192(%rdi), %xmm7
+; AVX512BW-NEXT:    vinserti32x4 $1, 192(%rdx), %ymm7, %ymm16
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm7 = ymm16[0],ymm1[0],ymm16[2],ymm1[2]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm7, %zmm17, %zmm7
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm18 {%k1}
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm12[1],ymm1[1],ymm12[3],ymm1[3]
+; AVX512BW-NEXT:    vmovdqa64 %zmm17, %zmm18 {%k1}
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm16[1],ymm1[1],ymm16[3],ymm1[3]
 ; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm18, %zmm1
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm12, %zmm5 {%k1}
-; AVX512BW-NEXT:    vmovdqa 256(%rsi), %xmm12
-; AVX512BW-NEXT:    vinserti128 $1, 256(%rcx), %ymm12, %ymm13
-; AVX512BW-NEXT:    vmovdqa 256(%rdi), %xmm12
-; AVX512BW-NEXT:    vinserti32x4 $1, 256(%rdx), %ymm12, %ymm18
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm12 = ymm18[0],ymm13[0],ymm18[2],ymm13[2]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm12, %zmm5, %zmm12
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm23, %zmm5 {%k1}
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm13 = ymm18[1],ymm13[1],ymm18[3],ymm13[3]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm13, %zmm5, %zmm23
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm29 {%k1}
-; AVX512BW-NEXT:    vmovdqa 320(%rsi), %xmm13
-; AVX512BW-NEXT:    vinserti128 $1, 320(%rcx), %ymm13, %ymm13
-; AVX512BW-NEXT:    vmovdqa64 320(%rdi), %xmm18
-; AVX512BW-NEXT:    vinserti32x4 $1, 320(%rdx), %ymm18, %ymm18
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm27 = ymm18[0],ymm13[0],ymm18[2],ymm13[2]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm27, %zmm29, %zmm22
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm6 {%k1}
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm13 = ymm18[1],ymm13[1],ymm18[3],ymm13[3]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm13, %zmm6, %zmm13
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm21 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 384(%rsi), %xmm18
-; AVX512BW-NEXT:    vinserti32x4 $1, 384(%rcx), %ymm18, %ymm18
-; AVX512BW-NEXT:    vmovdqa64 384(%rdi), %xmm25
-; AVX512BW-NEXT:    vinserti32x4 $1, 384(%rdx), %ymm25, %ymm25
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm27 = ymm25[0],ymm18[0],ymm25[2],ymm18[2]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm27, %zmm21, %zmm16
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm28 {%k1}
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm18 = ymm25[1],ymm18[1],ymm25[3],ymm18[3]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm18, %zmm28, %zmm21
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm15 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 448(%rsi), %xmm18
-; AVX512BW-NEXT:    vinserti32x4 $1, 448(%rcx), %ymm18, %ymm18
-; AVX512BW-NEXT:    vmovdqa64 448(%rdi), %xmm25
-; AVX512BW-NEXT:    vinserti32x4 $1, 448(%rdx), %ymm25, %ymm25
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm27 = ymm25[0],ymm18[0],ymm25[2],ymm18[2]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm27, %zmm15, %zmm6
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm14 {%k1}
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm18 = ymm25[1],ymm18[1],ymm25[3],ymm18[3]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm18, %zmm14, %zmm5
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm16, %zmm20 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 256(%rsi), %xmm16
+; AVX512BW-NEXT:    vinserti32x4 $1, 256(%rcx), %ymm16, %ymm16
+; AVX512BW-NEXT:    vmovdqa64 256(%rdi), %xmm17
+; AVX512BW-NEXT:    vinserti32x4 $1, 256(%rdx), %ymm17, %ymm17
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm18 = ymm17[0],ymm16[0],ymm17[2],ymm16[2]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm18, %zmm20, %zmm29
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm18, %zmm20 {%k1}
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm16 = ymm17[1],ymm16[1],ymm17[3],ymm16[3]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm16, %zmm20, %zmm30
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm16, %zmm20 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 320(%rsi), %xmm16
+; AVX512BW-NEXT:    vinserti32x4 $1, 320(%rcx), %ymm16, %ymm16
+; AVX512BW-NEXT:    vmovdqa64 320(%rdi), %xmm17
+; AVX512BW-NEXT:    vinserti32x4 $1, 320(%rdx), %ymm17, %ymm17
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm18 = ymm17[0],ymm16[0],ymm17[2],ymm16[2]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm18, %zmm20, %zmm18
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm20, %zmm31 {%k1}
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm16 = ymm17[1],ymm16[1],ymm17[3],ymm16[3]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm16, %zmm31, %zmm16
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm17, %zmm24 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 384(%rsi), %xmm17
+; AVX512BW-NEXT:    vinserti32x4 $1, 384(%rcx), %ymm17, %ymm17
+; AVX512BW-NEXT:    vmovdqa64 384(%rdi), %xmm20
+; AVX512BW-NEXT:    vinserti32x4 $1, 384(%rdx), %ymm20, %ymm20
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm31 = ymm20[0],ymm17[0],ymm20[2],ymm17[2]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm31, %zmm24, %zmm24
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm31, %zmm26 {%k1}
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm17 = ymm20[1],ymm17[1],ymm20[3],ymm17[3]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm17, %zmm26, %zmm17
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm20, %zmm11 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 448(%rsi), %xmm20
+; AVX512BW-NEXT:    vinserti32x4 $1, 448(%rcx), %ymm20, %ymm20
+; AVX512BW-NEXT:    vmovdqa64 448(%rdi), %xmm26
+; AVX512BW-NEXT:    vinserti32x4 $1, 448(%rdx), %ymm26, %ymm26
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm31 = ymm26[0],ymm20[0],ymm26[2],ymm20[2]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm31, %zmm11, %zmm11
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm31, %zmm12 {%k1}
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm20 = ymm26[1],ymm20[1],ymm26[3],ymm20[3]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm20, %zmm12, %zmm12
 ; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, 3776(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm31, 3712(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm24, 3264(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm20, 3200(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm17, 2752(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm8, 2688(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm7, 2240(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm26, 2176(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm9, 1728(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 1664(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 1216(%rax)
-; AVX512BW-NEXT:    vmovups (%rsp), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 1152(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 704(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 640(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 192(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 128(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 4032(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 3968(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 3904(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 3840(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm5, 3648(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm6, 3584(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 3520(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 3456(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 3392(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 3328(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm21, 3136(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm16, 3072(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 3008(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 2944(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 2880(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 2816(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm13, 2624(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm22, 2560(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 2496(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 2432(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 2368(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 2304(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm23, 2112(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm12, 2048(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 1984(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 1920(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 1856(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 1792(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm9, 3776(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm10, 3712(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm13, 3264(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm14, 3200(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm27, 2752(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm22, 2688(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm28, 2240(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm15, 2176(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm19, 1728(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm21, 1664(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm25, 1216(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm23, 1152(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm8, 704(%rax)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm8, 640(%rax)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm8, 192(%rax)
+; AVX512BW-NEXT:    vmovups (%rsp), %zmm8 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm8, 128(%rax)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm8, 4032(%rax)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm8, 3968(%rax)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm8, 3904(%rax)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm8, 3840(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm12, 3648(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm11, 3584(%rax)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm8, 3520(%rax)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm8, 3456(%rax)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm8, 3392(%rax)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm8, 3328(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm17, 3136(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm24, 3072(%rax)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm8, 3008(%rax)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm8, 2944(%rax)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm8, 2880(%rax)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm8, 2816(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm16, 2624(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm18, 2560(%rax)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm8, 2496(%rax)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm8, 2432(%rax)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm8, 2368(%rax)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm8, 2304(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm30, 2112(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm29, 2048(%rax)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm8, 1984(%rax)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm8, 1920(%rax)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm8, 1856(%rax)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm8, 1792(%rax)
 ; AVX512BW-NEXT:    vmovdqa64 %zmm1, 1600(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm30, 1536(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 1472(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 1408(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 1344(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 1280(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm19, 1088(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm7, 1536(%rax)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm1, 1472(%rax)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm1, 1408(%rax)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm1, 1344(%rax)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm1, 1280(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm6, 1088(%rax)
 ; AVX512BW-NEXT:    vmovdqa64 %zmm4, 1024(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 960(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 896(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 832(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 768(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm11, 576(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm10, 512(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 448(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 384(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 320(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 256(%rax)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm1, 960(%rax)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm1, 896(%rax)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm1, 832(%rax)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm1, 768(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm5, 576(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, 512(%rax)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm1, 448(%rax)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm1, 384(%rax)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm1, 320(%rax)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm1, 256(%rax)
 ; AVX512BW-NEXT:    vmovdqa64 %zmm2, 64(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm3, (%rax)
-; AVX512BW-NEXT:    addq $5512, %rsp # imm = 0x1588
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rax)
+; AVX512BW-NEXT:    addq $5384, %rsp # imm = 0x1508
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
  %in.vec0 = load <64 x i64>, ptr %in.vecptr0, align 64

More information about the llvm-commits mailing list