[llvm] 2773098 - [X86] combineConcatVectorOps - add basic concat(unpack(x,y),unpack(z,w)) -> unpack(concat(x,z),concat(y,w)) handling

Simon Pilgrim via llvm-commits llvm-commits at lists.llvm.org
Mon Jul 24 05:53:14 PDT 2023


Author: Simon Pilgrim
Date: 2023-07-24T13:50:15+01:00
New Revision: 2773098ee3187d5f9daca8938d57657dd89dd36f

URL: https://github.com/llvm/llvm-project/commit/2773098ee3187d5f9daca8938d57657dd89dd36f
DIFF: https://github.com/llvm/llvm-project/commit/2773098ee3187d5f9daca8938d57657dd89dd36f.diff

LOG: [X86] combineConcatVectorOps - add basic concat(unpack(x,y),unpack(z,w)) -> unpack(concat(x,z),concat(y,w)) handling

Very limited support, as we don't want to interfere with build_vector patterns
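
As an illustrative aside (not part of the commit): with 32-bit elements the fold amounts to the shufflevector equivalence sketched below. The function names, value names, and element type are made up for the sketch; on X86 the <4 x i32> interleave is UNPCKL (vpunpckldq), and the 256-bit form interleaves within each 128-bit lane, which is why both sides produce the same vector. Doing the interleave on the wider type can let the two concats fold away when x/z and y/w are already halves of wider registers.

  define <8 x i32> @concat_of_unpacks(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z, <4 x i32> %w) {
    ; unpckl(x,y) and unpckl(z,w), then concat the two 128-bit results
    %lo = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
    %hi = shufflevector <4 x i32> %z, <4 x i32> %w, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
    %r = shufflevector <4 x i32> %lo, <4 x i32> %hi, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
    ret <8 x i32> %r  ; <x0,y0,x1,y1,z0,w0,z1,w1>
  }

  define <8 x i32> @unpack_of_concats(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z, <4 x i32> %w) {
    ; concat(x,z) and concat(y,w), then one 256-bit unpckl (per-128-bit-lane interleave)
    %xz = shufflevector <4 x i32> %x, <4 x i32> %z, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
    %yw = shufflevector <4 x i32> %y, <4 x i32> %w, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
    %r = shufflevector <8 x i32> %xz, <8 x i32> %yw, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
    ret <8 x i32> %r  ; same <x0,y0,x1,y1,z0,w0,z1,w1>
  }
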

Added: 
    

Modified: 
    llvm/lib/Target/X86/X86ISelLowering.cpp
    llvm/test/CodeGen/X86/avx512fp16-frem.ll
    llvm/test/CodeGen/X86/vector-half-conversions.ll
    llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-4.ll
    llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-8.ll
    llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll
    llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-8.ll
    llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-4.ll
    llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-5.ll
    llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-6.ll
    llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-7.ll
    llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-8.ll
    llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll
    llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll
    llvm/test/CodeGen/X86/vector-reduce-ctpop.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 7be328b978b30c..a1ca3fe6048c1a 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -57272,6 +57272,24 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
       }
       break;
     }
+    case X86ISD::UNPCKH:
+    case X86ISD::UNPCKL: {
+      // Don't concatenate build_vector patterns.
+      if (!IsSplat && VT.getScalarSizeInBits() >= 32 &&
+          ((VT.is256BitVector() && Subtarget.hasInt256()) ||
+           (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
+          none_of(Ops, [](SDValue Op) {
+            return peekThroughBitcasts(Op.getOperand(0)).getOpcode() ==
+                       ISD::SCALAR_TO_VECTOR ||
+                   peekThroughBitcasts(Op.getOperand(1)).getOpcode() ==
+                       ISD::SCALAR_TO_VECTOR;
+          })) {
+        return DAG.getNode(Op0.getOpcode(), DL, VT,
+                           ConcatSubOperand(VT, Ops, 0),
+                           ConcatSubOperand(VT, Ops, 1));
+      }
+      break;
+    }
     case X86ISD::PSHUFHW:
     case X86ISD::PSHUFLW:
     case X86ISD::PSHUFD:

diff --git a/llvm/test/CodeGen/X86/avx512fp16-frem.ll b/llvm/test/CodeGen/X86/avx512fp16-frem.ll
index 887deee6d75da6..57ee3468e196bf 100644
--- a/llvm/test/CodeGen/X86/avx512fp16-frem.ll
+++ b/llvm/test/CodeGen/X86/avx512fp16-frem.ll
@@ -285,7 +285,7 @@ define <8 x half> @frem_vec8(<8 x half> %x, <8 x half> %y) nounwind {
 define <16 x half> @frem_vec16(<16 x half> %x, <16 x half> %y) nounwind {
 ; CHECK-LABEL: frem_vec16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    subq $152, %rsp
+; CHECK-NEXT:    subq $184, %rsp
 ; CHECK-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; CHECK-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
@@ -311,108 +311,90 @@ define <16 x half> @frem_vec16(<16 x half> %x, <16 x half> %y) nounwind {
 ; CHECK-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
 ; CHECK-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    vpsrldq $10, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT:    # xmm0 = mem[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT:    vpsrldq $14, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vpsrldq $10, (%rsp), %xmm1 # 16-byte Folded Reload
-; CHECK-NEXT:    # xmm1 = mem[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT:    vpsrldq $14, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm1 = mem[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; CHECK-NEXT:    vcvtsh2ss %xmm1, %xmm1, %xmm1
 ; CHECK-NEXT:    callq fmodf at PLT
 ; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
 ; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT:    # xmm0 = mem[1,0]
+; CHECK-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[3,3,3,3]
 ; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vpermilpd $1, (%rsp), %xmm1 # 16-byte Folded Reload
-; CHECK-NEXT:    # xmm1 = mem[1,0]
+; CHECK-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm1 = mem[3,3,3,3]
 ; CHECK-NEXT:    vcvtsh2ss %xmm1, %xmm1, %xmm1
 ; CHECK-NEXT:    callq fmodf at PLT
 ; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
 ; CHECK-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; CHECK-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; CHECK-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
+; CHECK-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    vpsrldq $10, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vpsrlq $48, (%rsp), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT:    vpsrldq $10, (%rsp), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm1 = mem[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; CHECK-NEXT:    vcvtsh2ss %xmm1, %xmm1, %xmm1
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    callq fmodf at PLT
 ; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
 ; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT:    # xmm0 = mem[1,1,3,3]
+; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[1,0]
 ; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vmovshdup (%rsp), %xmm1 # 16-byte Folded Reload
-; CHECK-NEXT:    # xmm1 = mem[1,1,3,3]
+; CHECK-NEXT:    vpermilpd $1, (%rsp), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm1 = mem[1,0]
 ; CHECK-NEXT:    vcvtsh2ss %xmm1, %xmm1, %xmm1
 ; CHECK-NEXT:    callq fmodf at PLT
 ; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
 ; CHECK-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
 ; CHECK-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT:    vpsrldq $10, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    vpsrldq $10, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm1 = mem[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; CHECK-NEXT:    vcvtsh2ss %xmm1, %xmm1, %xmm1
 ; CHECK-NEXT:    callq fmodf at PLT
 ; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
 ; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    vpsrld $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vpsrld $16, (%rsp), %xmm1 # 16-byte Folded Reload
-; CHECK-NEXT:    vcvtsh2ss %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    callq fmodf at PLT
-; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; CHECK-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; CHECK-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; CHECK-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0]
-; CHECK-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    vpsrldq $14, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT:    # xmm0 = mem[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vpsrldq $14, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
-; CHECK-NEXT:    # xmm1 = mem[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; CHECK-NEXT:    vcvtsh2ss %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    callq fmodf at PLT
-; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; CHECK-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT:    # xmm0 = mem[3,3,3,3]
+; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[1,0]
 ; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
-; CHECK-NEXT:    # xmm1 = mem[3,3,3,3]
+; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm1 = mem[1,0]
 ; CHECK-NEXT:    vcvtsh2ss %xmm1, %xmm1, %xmm1
 ; CHECK-NEXT:    callq fmodf at PLT
 ; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; CHECK-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
-; CHECK-NEXT:    vpsrldq $10, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT:    # xmm0 = mem[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT:    vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
+; CHECK-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT:    # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
+; CHECK-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vpsrldq $10, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
-; CHECK-NEXT:    # xmm1 = mem[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT:    vpsrlq $48, (%rsp), %xmm1 # 16-byte Folded Reload
 ; CHECK-NEXT:    vcvtsh2ss %xmm1, %xmm1, %xmm1
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    callq fmodf at PLT
 ; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
 ; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT:    # xmm0 = mem[1,0]
+; CHECK-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[1,1,3,3]
 ; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
-; CHECK-NEXT:    # xmm1 = mem[1,0]
+; CHECK-NEXT:    vmovshdup (%rsp), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm1 = mem[1,1,3,3]
 ; CHECK-NEXT:    vcvtsh2ss %xmm1, %xmm1, %xmm1
 ; CHECK-NEXT:    callq fmodf at PLT
 ; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
 ; CHECK-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; CHECK-NEXT:    vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; CHECK-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; CHECK-NEXT:    vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
 ; CHECK-NEXT:    vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
@@ -430,6 +412,24 @@ define <16 x half> @frem_vec16(<16 x half> %x, <16 x half> %y) nounwind {
 ; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
 ; CHECK-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; CHECK-NEXT:    vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
+; CHECK-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    vcvtsh2ss %xmm1, %xmm1, %xmm1
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    callq fmodf at PLT
+; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    vpsrld $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    vpsrld $16, (%rsp), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT:    vcvtsh2ss %xmm1, %xmm1, %xmm1
+; CHECK-NEXT:    callq fmodf at PLT
+; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
 ; CHECK-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; CHECK-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
@@ -438,21 +438,21 @@ define <16 x half> @frem_vec16(<16 x half> %x, <16 x half> %y) nounwind {
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    callq fmodf at PLT
 ; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
 ; CHECK-NEXT:    vpsrld $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
 ; CHECK-NEXT:    vpsrld $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
 ; CHECK-NEXT:    vcvtsh2ss %xmm1, %xmm1, %xmm1
 ; CHECK-NEXT:    callq fmodf at PLT
 ; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
 ; CHECK-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; CHECK-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; CHECK-NEXT:    vpunpcklqdq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0]
 ; CHECK-NEXT:    vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
-; CHECK-NEXT:    addq $152, %rsp
+; CHECK-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT:    # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
+; CHECK-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT:    # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2]
+; CHECK-NEXT:    addq $184, %rsp
 ; CHECK-NEXT:    retq
   %r = frem <16 x half> %x, %y
   ret <16 x half> %r
@@ -461,7 +461,7 @@ define <16 x half> @frem_vec16(<16 x half> %x, <16 x half> %y) nounwind {
 define <32 x half> @frem_vec32(<32 x half> %x, <32 x half> %y) nounwind {
 ; CHECK-LABEL: frem_vec32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    subq $248, %rsp
+; CHECK-NEXT:    subq $408, %rsp # imm = 0x198
 ; CHECK-NEXT:    vmovupd %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; CHECK-NEXT:    vmovupd %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; CHECK-NEXT:    vextractf32x4 $3, %zmm0, %xmm0
@@ -469,7 +469,7 @@ define <32 x half> @frem_vec32(<32 x half> %x, <32 x half> %y) nounwind {
 ; CHECK-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
 ; CHECK-NEXT:    vextractf32x4 $3, %zmm1, %xmm1
-; CHECK-NEXT:    vmovapd %xmm1, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; CHECK-NEXT:    vpsrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; CHECK-NEXT:    vcvtsh2ss %xmm1, %xmm1, %xmm1
 ; CHECK-NEXT:    vzeroupper
@@ -479,7 +479,7 @@ define <32 x half> @frem_vec32(<32 x half> %x, <32 x half> %y) nounwind {
 ; CHECK-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT:    # xmm0 = mem[3,3,3,3]
 ; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vpermilps $255, (%rsp), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
 ; CHECK-NEXT:    # xmm1 = mem[3,3,3,3]
 ; CHECK-NEXT:    vcvtsh2ss %xmm1, %xmm1, %xmm1
 ; CHECK-NEXT:    callq fmodf at PLT
@@ -487,85 +487,94 @@ define <32 x half> @frem_vec32(<32 x half> %x, <32 x half> %y) nounwind {
 ; CHECK-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
 ; CHECK-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    vpsrldq $10, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT:    # xmm0 = mem[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; CHECK-NEXT:    vextractf32x4 $2, %zmm0, %xmm0
+; CHECK-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vpsrldq $10, (%rsp), %xmm1 # 16-byte Folded Reload
-; CHECK-NEXT:    # xmm1 = mem[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; CHECK-NEXT:    vextractf32x4 $2, %zmm1, %xmm1
+; CHECK-NEXT:    vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    vpsrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; CHECK-NEXT:    vcvtsh2ss %xmm1, %xmm1, %xmm1
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    callq fmodf at PLT
 ; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
 ; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT:    # xmm0 = mem[1,0]
+; CHECK-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[3,3,3,3]
 ; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vpermilpd $1, (%rsp), %xmm1 # 16-byte Folded Reload
-; CHECK-NEXT:    # xmm1 = mem[1,0]
+; CHECK-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm1 = mem[3,3,3,3]
 ; CHECK-NEXT:    vcvtsh2ss %xmm1, %xmm1, %xmm1
 ; CHECK-NEXT:    callq fmodf at PLT
 ; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
 ; CHECK-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; CHECK-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; CHECK-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
+; CHECK-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; CHECK-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vpsrlq $48, (%rsp), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; CHECK-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; CHECK-NEXT:    vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    vpsrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; CHECK-NEXT:    vcvtsh2ss %xmm1, %xmm1, %xmm1
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    callq fmodf at PLT
 ; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
 ; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT:    # xmm0 = mem[1,1,3,3]
+; CHECK-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[3,3,3,3]
 ; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vmovshdup (%rsp), %xmm1 # 16-byte Folded Reload
-; CHECK-NEXT:    # xmm1 = mem[1,1,3,3]
+; CHECK-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm1 = mem[3,3,3,3]
 ; CHECK-NEXT:    vcvtsh2ss %xmm1, %xmm1, %xmm1
 ; CHECK-NEXT:    callq fmodf at PLT
 ; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
 ; CHECK-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
 ; CHECK-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT:    vpsrldq $14, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    vpsrldq $14, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm1 = mem[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; CHECK-NEXT:    vcvtsh2ss %xmm1, %xmm1, %xmm1
 ; CHECK-NEXT:    callq fmodf at PLT
 ; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    vpsrld $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[3,3,3,3]
 ; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vpsrld $16, (%rsp), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm1 = mem[3,3,3,3]
 ; CHECK-NEXT:    vcvtsh2ss %xmm1, %xmm1, %xmm1
 ; CHECK-NEXT:    callq fmodf at PLT
 ; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; CHECK-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; CHECK-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; CHECK-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0]
-; CHECK-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; CHECK-NEXT:    vextractf32x4 $2, %zmm0, %xmm0
-; CHECK-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT:    vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; CHECK-NEXT:    vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
+; CHECK-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
+; CHECK-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    vpsrldq $10, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; CHECK-NEXT:    vextractf32x4 $2, %zmm1, %xmm1
-; CHECK-NEXT:    vmovapd %xmm1, (%rsp) # 16-byte Spill
-; CHECK-NEXT:    vpsrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT:    vpsrldq $10, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm1 = mem[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; CHECK-NEXT:    vcvtsh2ss %xmm1, %xmm1, %xmm1
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    callq fmodf at PLT
 ; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
 ; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT:    # xmm0 = mem[3,3,3,3]
+; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[1,0]
 ; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vpermilps $255, (%rsp), %xmm1 # 16-byte Folded Reload
-; CHECK-NEXT:    # xmm1 = mem[3,3,3,3]
+; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm1 = mem[1,0]
 ; CHECK-NEXT:    vcvtsh2ss %xmm1, %xmm1, %xmm1
 ; CHECK-NEXT:    callq fmodf at PLT
 ; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
@@ -575,208 +584,196 @@ define <32 x half> @frem_vec32(<32 x half> %x, <32 x half> %y) nounwind {
 ; CHECK-NEXT:    vpsrldq $10, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT:    # xmm0 = mem[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vpsrldq $10, (%rsp), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT:    vpsrldq $10, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
 ; CHECK-NEXT:    # xmm1 = mem[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; CHECK-NEXT:    vcvtsh2ss %xmm1, %xmm1, %xmm1
 ; CHECK-NEXT:    callq fmodf at PLT
 ; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
 ; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT:    # xmm0 = mem[1,0]
 ; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vpermilpd $1, (%rsp), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
 ; CHECK-NEXT:    # xmm1 = mem[1,0]
 ; CHECK-NEXT:    vcvtsh2ss %xmm1, %xmm1, %xmm1
 ; CHECK-NEXT:    callq fmodf at PLT
 ; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; CHECK-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; CHECK-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
+; CHECK-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    vpsrldq $10, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vpsrlq $48, (%rsp), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT:    vpsrldq $10, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm1 = mem[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; CHECK-NEXT:    vcvtsh2ss %xmm1, %xmm1, %xmm1
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    callq fmodf at PLT
 ; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT:    # xmm0 = mem[1,1,3,3]
+; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[1,0]
 ; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vmovshdup (%rsp), %xmm1 # 16-byte Folded Reload
-; CHECK-NEXT:    # xmm1 = mem[1,1,3,3]
+; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm1 = mem[1,0]
 ; CHECK-NEXT:    vcvtsh2ss %xmm1, %xmm1, %xmm1
 ; CHECK-NEXT:    callq fmodf at PLT
 ; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; CHECK-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    vpsrldq $10, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    vpsrldq $10, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm1 = mem[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; CHECK-NEXT:    vcvtsh2ss %xmm1, %xmm1, %xmm1
 ; CHECK-NEXT:    callq fmodf at PLT
 ; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
 ; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    vpsrld $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[1,0]
 ; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vpsrld $16, (%rsp), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm1 = mem[1,0]
 ; CHECK-NEXT:    vcvtsh2ss %xmm1, %xmm1, %xmm1
 ; CHECK-NEXT:    callq fmodf at PLT
 ; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; CHECK-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; CHECK-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; CHECK-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0]
-; CHECK-NEXT:    vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
-; CHECK-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; CHECK-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; CHECK-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; CHECK-NEXT:    vinserti128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload
+; CHECK-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
+; CHECK-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT:    # zmm0 = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13]
+; CHECK-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; CHECK-NEXT:    vextractf128 $1, %ymm1, %xmm1
-; CHECK-NEXT:    vmovapd %xmm1, (%rsp) # 16-byte Spill
-; CHECK-NEXT:    vpsrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT:    vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
 ; CHECK-NEXT:    vcvtsh2ss %xmm1, %xmm1, %xmm1
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    callq fmodf at PLT
 ; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
 ; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT:    # xmm0 = mem[3,3,3,3]
+; CHECK-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[1,1,3,3]
 ; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vpermilps $255, (%rsp), %xmm1 # 16-byte Folded Reload
-; CHECK-NEXT:    # xmm1 = mem[3,3,3,3]
+; CHECK-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm1 = mem[1,1,3,3]
 ; CHECK-NEXT:    vcvtsh2ss %xmm1, %xmm1, %xmm1
 ; CHECK-NEXT:    callq fmodf at PLT
 ; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
 ; CHECK-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
 ; CHECK-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    vpsrldq $10, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT:    # xmm0 = mem[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT:    vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vpsrldq $10, (%rsp), %xmm1 # 16-byte Folded Reload
-; CHECK-NEXT:    # xmm1 = mem[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT:    vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
 ; CHECK-NEXT:    vcvtsh2ss %xmm1, %xmm1, %xmm1
 ; CHECK-NEXT:    callq fmodf at PLT
 ; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT:    # xmm0 = mem[1,0]
+; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[1,1,3,3]
 ; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vpermilpd $1, (%rsp), %xmm1 # 16-byte Folded Reload
-; CHECK-NEXT:    # xmm1 = mem[1,0]
+; CHECK-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm1 = mem[1,1,3,3]
 ; CHECK-NEXT:    vcvtsh2ss %xmm1, %xmm1, %xmm1
 ; CHECK-NEXT:    callq fmodf at PLT
 ; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; CHECK-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; CHECK-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
+; CHECK-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; CHECK-NEXT:    vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vpsrlq $48, (%rsp), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT:    vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
 ; CHECK-NEXT:    vcvtsh2ss %xmm1, %xmm1, %xmm1
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    callq fmodf at PLT
 ; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
 ; CHECK-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT:    # xmm0 = mem[1,1,3,3]
 ; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vmovshdup (%rsp), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
 ; CHECK-NEXT:    # xmm1 = mem[1,1,3,3]
 ; CHECK-NEXT:    vcvtsh2ss %xmm1, %xmm1, %xmm1
 ; CHECK-NEXT:    callq fmodf at PLT
 ; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; CHECK-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
 ; CHECK-NEXT:    vcvtsh2ss %xmm1, %xmm1, %xmm1
 ; CHECK-NEXT:    callq fmodf at PLT
 ; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
 ; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    vpsrld $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[1,1,3,3]
 ; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vpsrld $16, (%rsp), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm1 = mem[1,1,3,3]
 ; CHECK-NEXT:    vcvtsh2ss %xmm1, %xmm1, %xmm1
 ; CHECK-NEXT:    callq fmodf at PLT
 ; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; CHECK-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; CHECK-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; CHECK-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0]
-; CHECK-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    vpsrldq $14, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT:    # xmm0 = mem[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; CHECK-NEXT:    vinserti128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload
+; CHECK-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
+; CHECK-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vpsrldq $14, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
-; CHECK-NEXT:    # xmm1 = mem[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; CHECK-NEXT:    vcvtsh2ss %xmm1, %xmm1, %xmm1
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    callq fmodf at PLT
 ; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
 ; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; CHECK-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT:    # xmm0 = mem[3,3,3,3]
+; CHECK-NEXT:    vpsrld $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
-; CHECK-NEXT:    # xmm1 = mem[3,3,3,3]
+; CHECK-NEXT:    vpsrld $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
 ; CHECK-NEXT:    vcvtsh2ss %xmm1, %xmm1, %xmm1
 ; CHECK-NEXT:    callq fmodf at PLT
 ; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; CHECK-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
-; CHECK-NEXT:    vpsrldq $10, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT:    # xmm0 = mem[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; CHECK-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vpsrldq $10, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
-; CHECK-NEXT:    # xmm1 = mem[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; CHECK-NEXT:    vcvtsh2ss %xmm1, %xmm1, %xmm1
 ; CHECK-NEXT:    callq fmodf at PLT
 ; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
 ; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT:    # xmm0 = mem[1,0]
+; CHECK-NEXT:    vpsrld $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
-; CHECK-NEXT:    # xmm1 = mem[1,0]
+; CHECK-NEXT:    vpsrld $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
 ; CHECK-NEXT:    vcvtsh2ss %xmm1, %xmm1, %xmm1
 ; CHECK-NEXT:    callq fmodf at PLT
 ; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; CHECK-NEXT:    vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; CHECK-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
-; CHECK-NEXT:    vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; CHECK-NEXT:    vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
+; CHECK-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; CHECK-NEXT:    vcvtsh2ss %xmm1, %xmm1, %xmm1
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    callq fmodf at PLT
 ; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
 ; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT:    # xmm0 = mem[1,1,3,3]
+; CHECK-NEXT:    vpsrld $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
-; CHECK-NEXT:    # xmm1 = mem[1,1,3,3]
+; CHECK-NEXT:    vpsrld $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
 ; CHECK-NEXT:    vcvtsh2ss %xmm1, %xmm1, %xmm1
 ; CHECK-NEXT:    callq fmodf at PLT
 ; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; CHECK-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
 ; CHECK-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; CHECK-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
@@ -794,13 +791,13 @@ define <32 x half> @frem_vec32(<32 x half> %x, <32 x half> %y) nounwind {
 ; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
 ; CHECK-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; CHECK-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; CHECK-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; CHECK-NEXT:    vpunpcklqdq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0]
 ; CHECK-NEXT:    vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
 ; CHECK-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
-; CHECK-NEXT:    addq $248, %rsp
+; CHECK-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT:    # zmm0 = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13]
+; CHECK-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT:    # zmm0 = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6]
+; CHECK-NEXT:    addq $408, %rsp # imm = 0x198
 ; CHECK-NEXT:    retq
   %r = frem <32 x half> %x, %y
   ret <32 x half> %r
@@ -984,7 +981,7 @@ define <8 x half> @frem_strict_vec8(<8 x half> %x, <8 x half> %y) nounwind #0 {
 define <16 x half> @frem_strict_vec16(<16 x half> %x, <16 x half> %y) nounwind #0 {
 ; CHECK-LABEL: frem_strict_vec16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    subq $152, %rsp
+; CHECK-NEXT:    subq $184, %rsp
 ; CHECK-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; CHECK-NEXT:    vextractf128 $1, %ymm1, %xmm1
@@ -1010,108 +1007,90 @@ define <16 x half> @frem_strict_vec16(<16 x half> %x, <16 x half> %y) nounwind #
 ; CHECK-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
 ; CHECK-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    vpsrldq $10, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT:    # xmm0 = mem[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT:    vpsrldq $14, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm1
-; CHECK-NEXT:    vpsrldq $10, (%rsp), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT:    # xmm0 = mem[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT:    vpsrldq $14, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
 ; CHECK-NEXT:    callq fmodf at PLT
 ; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
 ; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT:    # xmm0 = mem[1,0]
+; CHECK-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[3,3,3,3]
 ; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm1
-; CHECK-NEXT:    vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT:    # xmm0 = mem[1,0]
+; CHECK-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[3,3,3,3]
 ; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
 ; CHECK-NEXT:    callq fmodf at PLT
 ; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
 ; CHECK-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; CHECK-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; CHECK-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
+; CHECK-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    vpsrldq $10, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm1
-; CHECK-NEXT:    vpsrlq $48, (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    vpsrldq $10, (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    callq fmodf at PLT
 ; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
 ; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT:    # xmm0 = mem[1,1,3,3]
+; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[1,0]
 ; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm1
-; CHECK-NEXT:    vmovshdup (%rsp), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT:    # xmm0 = mem[1,1,3,3]
+; CHECK-NEXT:    vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[1,0]
 ; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
 ; CHECK-NEXT:    callq fmodf at PLT
 ; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
 ; CHECK-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
 ; CHECK-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT:    vpsrldq $10, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm1
-; CHECK-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT:    vpsrldq $10, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
 ; CHECK-NEXT:    callq fmodf at PLT
 ; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
 ; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    vpsrld $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm1
-; CHECK-NEXT:    vpsrld $16, (%rsp), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    callq fmodf at PLT
-; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; CHECK-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; CHECK-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; CHECK-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0]
-; CHECK-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    vpsrldq $14, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT:    # xmm0 = mem[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm1
-; CHECK-NEXT:    vpsrldq $14, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT:    # xmm0 = mem[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    callq fmodf at PLT
-; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; CHECK-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT:    # xmm0 = mem[3,3,3,3]
+; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[1,0]
 ; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm1
-; CHECK-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT:    # xmm0 = mem[3,3,3,3]
+; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[1,0]
 ; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
 ; CHECK-NEXT:    callq fmodf@PLT
 ; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; CHECK-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
-; CHECK-NEXT:    vpsrldq $10, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT:    # xmm0 = mem[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT:    vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
+; CHECK-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT:    # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
+; CHECK-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm1
-; CHECK-NEXT:    vpsrldq $10, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT:    # xmm0 = mem[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT:    vpsrlq $48, (%rsp), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    callq fmodf@PLT
 ; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
 ; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT:    # xmm0 = mem[1,0]
+; CHECK-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[1,1,3,3]
 ; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm1
-; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT:    # xmm0 = mem[1,0]
+; CHECK-NEXT:    vmovshdup (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[1,1,3,3]
 ; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
 ; CHECK-NEXT:    callq fmodf@PLT
 ; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
 ; CHECK-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; CHECK-NEXT:    vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; CHECK-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; CHECK-NEXT:    vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm1
 ; CHECK-NEXT:    vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
@@ -1129,10 +1108,11 @@ define <16 x half> @frem_strict_vec16(<16 x half> %x, <16 x half> %y) nounwind #
 ; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
 ; CHECK-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; CHECK-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; CHECK-NEXT:    vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
+; CHECK-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm1
-; CHECK-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; CHECK-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
 ; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    callq fmodf@PLT
@@ -1140,27 +1120,44 @@ define <16 x half> @frem_strict_vec16(<16 x half> %x, <16 x half> %y) nounwind #
 ; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; CHECK-NEXT:    vpsrld $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm1
-; CHECK-NEXT:    vpsrld $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    vpsrld $16, (%rsp), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
 ; CHECK-NEXT:    callq fmodf@PLT
 ; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
 ; CHECK-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; CHECK-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; CHECK-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; CHECK-NEXT:    vpunpcklqdq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0]
-; CHECK-NEXT:    vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
-; CHECK-NEXT:    addq $152, %rsp
-; CHECK-NEXT:    retq
-  %result = call <16 x half> @llvm.experimental.constrained.frem.v16f16(<16 x half> %x, <16 x half> %y, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
-  ret <16 x half> %result
-}
-
-define <32 x half> @frem_strict_vec32(<32 x half> %x, <32 x half> %y) nounwind #0 {
+; CHECK-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm1
+; CHECK-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    callq fmodf@PLT
+; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    vpsrld $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm1
+; CHECK-NEXT:    vpsrld $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    callq fmodf@PLT
+; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; CHECK-NEXT:    vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
+; CHECK-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT:    # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
+; CHECK-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT:    # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2]
+; CHECK-NEXT:    addq $184, %rsp
+; CHECK-NEXT:    retq
+  %result = call <16 x half> @llvm.experimental.constrained.frem.v16f16(<16 x half> %x, <16 x half> %y, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
+  ret <16 x half> %result
+}
+
+define <32 x half> @frem_strict_vec32(<32 x half> %x, <32 x half> %y) nounwind #0 {
 ; CHECK-LABEL: frem_strict_vec32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    subq $248, %rsp
+; CHECK-NEXT:    subq $408, %rsp # imm = 0x198
 ; CHECK-NEXT:    vmovupd %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; CHECK-NEXT:    vextractf32x4 $3, %zmm1, %xmm1
@@ -1168,7 +1165,7 @@ define <32 x half> @frem_strict_vec32(<32 x half> %x, <32 x half> %y) nounwind #
 ; CHECK-NEXT:    vpsrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; CHECK-NEXT:    vcvtsh2ss %xmm1, %xmm1, %xmm1
 ; CHECK-NEXT:    vextractf32x4 $3, %zmm0, %xmm0
-; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; CHECK-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
 ; CHECK-NEXT:    vzeroupper
@@ -1178,7 +1175,7 @@ define <32 x half> @frem_strict_vec32(<32 x half> %x, <32 x half> %y) nounwind #
 ; CHECK-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT:    # xmm0 = mem[3,3,3,3]
 ; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm1
-; CHECK-NEXT:    vpermilps $255, (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT:    # xmm0 = mem[3,3,3,3]
 ; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
 ; CHECK-NEXT:    callq fmodf@PLT
@@ -1186,85 +1183,94 @@ define <32 x half> @frem_strict_vec32(<32 x half> %x, <32 x half> %y) nounwind #
 ; CHECK-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
 ; CHECK-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    vpsrldq $10, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT:    # xmm0 = mem[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; CHECK-NEXT:    vextractf32x4 $2, %zmm0, %xmm0
+; CHECK-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm1
-; CHECK-NEXT:    vpsrldq $10, (%rsp), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT:    # xmm0 = mem[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; CHECK-NEXT:    vextractf32x4 $2, %zmm0, %xmm0
+; CHECK-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    callq fmodf@PLT
 ; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
 ; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT:    # xmm0 = mem[1,0]
+; CHECK-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[3,3,3,3]
 ; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm1
-; CHECK-NEXT:    vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT:    # xmm0 = mem[1,0]
+; CHECK-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[3,3,3,3]
 ; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
 ; CHECK-NEXT:    callq fmodf@PLT
 ; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
 ; CHECK-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; CHECK-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; CHECK-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
+; CHECK-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; CHECK-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm1
-; CHECK-NEXT:    vpsrlq $48, (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; CHECK-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    callq fmodf@PLT
 ; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
 ; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT:    # xmm0 = mem[1,1,3,3]
+; CHECK-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[3,3,3,3]
 ; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm1
-; CHECK-NEXT:    vmovshdup (%rsp), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT:    # xmm0 = mem[1,1,3,3]
+; CHECK-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[3,3,3,3]
 ; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
 ; CHECK-NEXT:    callq fmodf@PLT
 ; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
 ; CHECK-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
 ; CHECK-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT:    vpsrldq $14, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm1
-; CHECK-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT:    vpsrldq $14, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
 ; CHECK-NEXT:    callq fmodf@PLT
 ; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    vpsrld $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[3,3,3,3]
 ; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm1
-; CHECK-NEXT:    vpsrld $16, (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[3,3,3,3]
 ; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
 ; CHECK-NEXT:    callq fmodf@PLT
 ; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; CHECK-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; CHECK-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; CHECK-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0]
-; CHECK-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; CHECK-NEXT:    vextractf32x4 $2, %zmm0, %xmm0
-; CHECK-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT:    vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; CHECK-NEXT:    vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
+; CHECK-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
+; CHECK-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    vpsrldq $10, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm1
-; CHECK-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; CHECK-NEXT:    vextractf32x4 $2, %zmm0, %xmm0
-; CHECK-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
-; CHECK-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT:    vpsrldq $10, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    callq fmodf@PLT
 ; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
 ; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT:    # xmm0 = mem[3,3,3,3]
+; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[1,0]
 ; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm1
-; CHECK-NEXT:    vpermilps $255, (%rsp), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT:    # xmm0 = mem[3,3,3,3]
+; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[1,0]
 ; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
 ; CHECK-NEXT:    callq fmodf@PLT
 ; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
@@ -1274,208 +1280,196 @@ define <32 x half> @frem_strict_vec32(<32 x half> %x, <32 x half> %y) nounwind #
 ; CHECK-NEXT:    vpsrldq $10, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT:    # xmm0 = mem[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm1
-; CHECK-NEXT:    vpsrldq $10, (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    vpsrldq $10, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT:    # xmm0 = mem[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
 ; CHECK-NEXT:    callq fmodf@PLT
 ; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
 ; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT:    # xmm0 = mem[1,0]
 ; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm1
-; CHECK-NEXT:    vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT:    # xmm0 = mem[1,0]
 ; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
 ; CHECK-NEXT:    callq fmodf@PLT
 ; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; CHECK-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; CHECK-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
+; CHECK-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    vpsrldq $10, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm1
-; CHECK-NEXT:    vpsrlq $48, (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    vpsrldq $10, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    callq fmodf@PLT
 ; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT:    # xmm0 = mem[1,1,3,3]
+; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[1,0]
 ; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm1
-; CHECK-NEXT:    vmovshdup (%rsp), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT:    # xmm0 = mem[1,1,3,3]
+; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[1,0]
 ; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
 ; CHECK-NEXT:    callq fmodf@PLT
 ; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; CHECK-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    vpsrldq $10, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm1
-; CHECK-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT:    vpsrldq $10, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
 ; CHECK-NEXT:    callq fmodf@PLT
 ; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
 ; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    vpsrld $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[1,0]
 ; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm1
-; CHECK-NEXT:    vpsrld $16, (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[1,0]
 ; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
 ; CHECK-NEXT:    callq fmodf@PLT
 ; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; CHECK-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; CHECK-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; CHECK-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0]
-; CHECK-NEXT:    vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
-; CHECK-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; CHECK-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; CHECK-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; CHECK-NEXT:    vinserti128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload
+; CHECK-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
+; CHECK-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT:    # zmm0 = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13]
+; CHECK-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm1
-; CHECK-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; CHECK-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
-; CHECK-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT:    vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    callq fmodf@PLT
 ; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
 ; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT:    # xmm0 = mem[3,3,3,3]
+; CHECK-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[1,1,3,3]
 ; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm1
-; CHECK-NEXT:    vpermilps $255, (%rsp), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT:    # xmm0 = mem[3,3,3,3]
+; CHECK-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[1,1,3,3]
 ; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
 ; CHECK-NEXT:    callq fmodf@PLT
 ; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
 ; CHECK-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
 ; CHECK-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    vpsrldq $10, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT:    # xmm0 = mem[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT:    vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm1
-; CHECK-NEXT:    vpsrldq $10, (%rsp), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT:    # xmm0 = mem[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT:    vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
 ; CHECK-NEXT:    callq fmodf@PLT
 ; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT:    # xmm0 = mem[1,0]
+; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[1,1,3,3]
 ; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm1
-; CHECK-NEXT:    vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT:    # xmm0 = mem[1,0]
+; CHECK-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[1,1,3,3]
 ; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
 ; CHECK-NEXT:    callq fmodf@PLT
 ; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; CHECK-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; CHECK-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
+; CHECK-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; CHECK-NEXT:    vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm1
-; CHECK-NEXT:    vpsrlq $48, (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    callq fmodf@PLT
 ; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
 ; CHECK-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT:    # xmm0 = mem[1,1,3,3]
 ; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm1
-; CHECK-NEXT:    vmovshdup (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT:    # xmm0 = mem[1,1,3,3]
 ; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
 ; CHECK-NEXT:    callq fmodf@PLT
 ; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; CHECK-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm1
-; CHECK-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT:    vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
 ; CHECK-NEXT:    callq fmodf@PLT
 ; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
 ; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    vpsrld $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[1,1,3,3]
 ; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm1
-; CHECK-NEXT:    vpsrld $16, (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = mem[1,1,3,3]
 ; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
 ; CHECK-NEXT:    callq fmodf@PLT
 ; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; CHECK-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; CHECK-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; CHECK-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0]
-; CHECK-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    vpsrldq $14, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT:    # xmm0 = mem[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; CHECK-NEXT:    vinserti128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload
+; CHECK-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
+; CHECK-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm1
-; CHECK-NEXT:    vpsrldq $14, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT:    # xmm0 = mem[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    callq fmodf@PLT
 ; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
 ; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; CHECK-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT:    # xmm0 = mem[3,3,3,3]
+; CHECK-NEXT:    vpsrld $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm1
-; CHECK-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT:    # xmm0 = mem[3,3,3,3]
+; CHECK-NEXT:    vpsrld $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
 ; CHECK-NEXT:    callq fmodf@PLT
 ; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; CHECK-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
-; CHECK-NEXT:    vpsrldq $10, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT:    # xmm0 = mem[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; CHECK-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm1
-; CHECK-NEXT:    vpsrldq $10, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT:    # xmm0 = mem[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
 ; CHECK-NEXT:    callq fmodf@PLT
 ; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
 ; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT:    # xmm0 = mem[1,0]
+; CHECK-NEXT:    vpsrld $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm1
-; CHECK-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT:    # xmm0 = mem[1,0]
+; CHECK-NEXT:    vpsrld $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
 ; CHECK-NEXT:    callq fmodf@PLT
 ; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; CHECK-NEXT:    vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; CHECK-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
-; CHECK-NEXT:    vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; CHECK-NEXT:    vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
+; CHECK-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm1
-; CHECK-NEXT:    vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    callq fmodf@PLT
 ; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
 ; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT:    # xmm0 = mem[1,1,3,3]
+; CHECK-NEXT:    vpsrld $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm1
-; CHECK-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT:    # xmm0 = mem[1,1,3,3]
+; CHECK-NEXT:    vpsrld $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
 ; CHECK-NEXT:    callq fmodf@PLT
 ; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; CHECK-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
 ; CHECK-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; CHECK-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; CHECK-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm1
@@ -1493,13 +1487,13 @@ define <32 x half> @frem_strict_vec32(<32 x half> %x, <32 x half> %y) nounwind #
 ; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
 ; CHECK-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; CHECK-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; CHECK-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; CHECK-NEXT:    vpunpcklqdq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0]
 ; CHECK-NEXT:    vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
 ; CHECK-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
-; CHECK-NEXT:    addq $248, %rsp
+; CHECK-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT:    # zmm0 = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13]
+; CHECK-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT:    # zmm0 = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6]
+; CHECK-NEXT:    addq $408, %rsp # imm = 0x198
 ; CHECK-NEXT:    retq
   %result = call <32 x half> @llvm.experimental.constrained.frem.v32f16(<32 x half> %x, <32 x half> %y, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
   ret <32 x half> %result

diff  --git a/llvm/test/CodeGen/X86/vector-half-conversions.ll b/llvm/test/CodeGen/X86/vector-half-conversions.ll
index 2a06e91c25af75..818480854c9df3 100644
--- a/llvm/test/CodeGen/X86/vector-half-conversions.ll
+++ b/llvm/test/CodeGen/X86/vector-half-conversions.ll
@@ -2475,7 +2475,7 @@ define <16 x i16> @cvt_16f32_to_16i16(<16 x float> %a0) nounwind {
 ;
 ; AVX2-LABEL: cvt_16f32_to_16i16:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    subq $120, %rsp
+; AVX2-NEXT:    subq $184, %rsp
 ; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vextractf128 $1, %ymm1, %xmm0
@@ -2483,63 +2483,74 @@ define <16 x i16> @cvt_16f32_to_16i16(<16 x float> %a0) nounwind {
 ; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    callq __truncsfhf2@PLT
+; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX2-NEXT:    # xmm0 = mem[1,0]
+; AVX2-NEXT:    callq __truncsfhf2@PLT
+; AVX2-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    callq __truncsfhf2@PLT
 ; AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
 ; AVX2-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; AVX2-NEXT:    # xmm0 = mem[1,0]
 ; AVX2-NEXT:    callq __truncsfhf2@PLT
 ; AVX2-NEXT:    vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
 ; AVX2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX2-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
+; AVX2-NEXT:    vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
+; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    callq __truncsfhf2@PLT
-; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
 ; AVX2-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; AVX2-NEXT:    # xmm0 = mem[1,1,3,3]
 ; AVX2-NEXT:    callq __truncsfhf2@PLT
-; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX2-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
 ; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX2-NEXT:    vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
 ; AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX2-NEXT:    # xmm0 = mem[3,3,3,3]
+; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; AVX2-NEXT:    callq __truncsfhf2@PLT
 ; AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; AVX2-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX2-NEXT:    # xmm0 = mem[1,0]
+; AVX2-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX2-NEXT:    # xmm0 = mem[1,1,3,3]
 ; AVX2-NEXT:    callq __truncsfhf2@PLT
-; AVX2-NEXT:    vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX2-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
+; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX2-NEXT:    vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
+; AVX2-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
+; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX2-NEXT:    # xmm0 = mem[3,3,3,3]
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    callq __truncsfhf2@PLT
 ; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX2-NEXT:    # xmm0 = mem[1,1,3,3]
+; AVX2-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX2-NEXT:    # xmm0 = mem[1,0]
 ; AVX2-NEXT:    callq __truncsfhf2@PLT
-; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX2-NEXT:    vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; AVX2-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX2-NEXT:    # xmm0 = xmm0[0],mem[0]
+; AVX2-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
 ; AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX2-NEXT:    # xmm0 = mem[3,3,3,3]
 ; AVX2-NEXT:    callq __truncsfhf2@PLT
-; AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX2-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; AVX2-NEXT:    # xmm0 = mem[1,0]
 ; AVX2-NEXT:    callq __truncsfhf2@PLT
-; AVX2-NEXT:    vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX2-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
 ; AVX2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX2-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX2-NEXT:    vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
+; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    callq __truncsfhf2@PLT
 ; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX2-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
@@ -2547,19 +2558,7 @@ define <16 x i16> @cvt_16f32_to_16i16(<16 x float> %a0) nounwind {
 ; AVX2-NEXT:    callq __truncsfhf2@PLT
 ; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX2-NEXT:    vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
 ; AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX2-NEXT:    # xmm0 = mem[3,3,3,3]
-; AVX2-NEXT:    callq __truncsfhf2@PLT
-; AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; AVX2-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX2-NEXT:    # xmm0 = mem[1,0]
-; AVX2-NEXT:    callq __truncsfhf2@PLT
-; AVX2-NEXT:    vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX2-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
 ; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
 ; AVX2-NEXT:    vzeroupper
@@ -2570,12 +2569,12 @@ define <16 x i16> @cvt_16f32_to_16i16(<16 x float> %a0) nounwind {
 ; AVX2-NEXT:    callq __truncsfhf2@PLT
 ; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX2-NEXT:    vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; AVX2-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX2-NEXT:    # xmm0 = xmm0[0],mem[0]
 ; AVX2-NEXT:    vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
-; AVX2-NEXT:    addq $120, %rsp
+; AVX2-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
+; AVX2-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2]
+; AVX2-NEXT:    addq $184, %rsp
 ; AVX2-NEXT:    retq
 ;
 ; F16C-LABEL: cvt_16f32_to_16i16:
@@ -2961,7 +2960,7 @@ define void @store_cvt_16f32_to_16i16(<16 x float> %a0, ptr %a1) nounwind {
 ; AVX2-LABEL: store_cvt_16f32_to_16i16:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    pushq %rbx
-; AVX2-NEXT:    subq $112, %rsp
+; AVX2-NEXT:    subq $176, %rsp
 ; AVX2-NEXT:    movq %rdi, %rbx
 ; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -2970,63 +2969,74 @@ define void @store_cvt_16f32_to_16i16(<16 x float> %a0, ptr %a1) nounwind {
 ; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    callq __truncsfhf2@PLT
+; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX2-NEXT:    # xmm0 = mem[1,0]
+; AVX2-NEXT:    callq __truncsfhf2@PLT
+; AVX2-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    callq __truncsfhf2@PLT
 ; AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
 ; AVX2-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; AVX2-NEXT:    # xmm0 = mem[1,0]
 ; AVX2-NEXT:    callq __truncsfhf2@PLT
 ; AVX2-NEXT:    vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
 ; AVX2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX2-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
+; AVX2-NEXT:    vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
+; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    callq __truncsfhf2@PLT
-; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
 ; AVX2-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; AVX2-NEXT:    # xmm0 = mem[1,1,3,3]
 ; AVX2-NEXT:    callq __truncsfhf2@PLT
-; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX2-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
 ; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX2-NEXT:    vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
 ; AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX2-NEXT:    # xmm0 = mem[3,3,3,3]
+; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; AVX2-NEXT:    callq __truncsfhf2@PLT
 ; AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; AVX2-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX2-NEXT:    # xmm0 = mem[1,0]
+; AVX2-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX2-NEXT:    # xmm0 = mem[1,1,3,3]
 ; AVX2-NEXT:    callq __truncsfhf2@PLT
-; AVX2-NEXT:    vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX2-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
+; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX2-NEXT:    vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
+; AVX2-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
+; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX2-NEXT:    # xmm0 = mem[3,3,3,3]
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    callq __truncsfhf2@PLT
 ; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX2-NEXT:    # xmm0 = mem[1,1,3,3]
+; AVX2-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX2-NEXT:    # xmm0 = mem[1,0]
 ; AVX2-NEXT:    callq __truncsfhf2@PLT
-; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX2-NEXT:    vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; AVX2-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX2-NEXT:    # xmm0 = xmm0[0],mem[0]
+; AVX2-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
 ; AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX2-NEXT:    # xmm0 = mem[3,3,3,3]
 ; AVX2-NEXT:    callq __truncsfhf2@PLT
-; AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX2-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; AVX2-NEXT:    # xmm0 = mem[1,0]
 ; AVX2-NEXT:    callq __truncsfhf2@PLT
-; AVX2-NEXT:    vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX2-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
 ; AVX2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX2-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX2-NEXT:    vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
+; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    callq __truncsfhf2@PLT
 ; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX2-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
@@ -3034,19 +3044,7 @@ define void @store_cvt_16f32_to_16i16(<16 x float> %a0, ptr %a1) nounwind {
 ; AVX2-NEXT:    callq __truncsfhf2@PLT
 ; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX2-NEXT:    vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
 ; AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX2-NEXT:    # xmm0 = mem[3,3,3,3]
-; AVX2-NEXT:    callq __truncsfhf2@PLT
-; AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; AVX2-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX2-NEXT:    # xmm0 = mem[1,0]
-; AVX2-NEXT:    callq __truncsfhf2@PLT
-; AVX2-NEXT:    vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX2-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
 ; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
 ; AVX2-NEXT:    vzeroupper
@@ -3057,13 +3055,13 @@ define void @store_cvt_16f32_to_16i16(<16 x float> %a0, ptr %a1) nounwind {
 ; AVX2-NEXT:    callq __truncsfhf2@PLT
 ; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX2-NEXT:    vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; AVX2-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX2-NEXT:    # xmm0 = xmm0[0],mem[0]
 ; AVX2-NEXT:    vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
+; AVX2-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
+; AVX2-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2]
 ; AVX2-NEXT:    vmovdqa %ymm0, (%rbx)
-; AVX2-NEXT:    addq $112, %rsp
+; AVX2-NEXT:    addq $176, %rsp
 ; AVX2-NEXT:    popq %rbx
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
@@ -4688,7 +4686,7 @@ define void @store_cvt_32f32_to_32f16(<32 x float> %a0, ptr %a1) nounwind {
 ; AVX2-LABEL: store_cvt_32f32_to_32f16:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    pushq %rbx
-; AVX2-NEXT:    subq $176, %rsp
+; AVX2-NEXT:    subq $240, %rsp
 ; AVX2-NEXT:    movq %rdi, %rbx
 ; AVX2-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -4705,8 +4703,23 @@ define void @store_cvt_32f32_to_32f16(<32 x float> %a0, ptr %a1) nounwind {
 ; AVX2-NEXT:    callq __truncsfhf2@PLT
 ; AVX2-NEXT:    vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
 ; AVX2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX2-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
+; AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    callq __truncsfhf2@PLT
+; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-NEXT:    vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
+; AVX2-NEXT:    # xmm0 = mem[1,0]
+; AVX2-NEXT:    callq __truncsfhf2@PLT
+; AVX2-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; AVX2-NEXT:    vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
+; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    callq __truncsfhf2@PLT
 ; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX2-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
@@ -4714,11 +4727,22 @@ define void @store_cvt_32f32_to_32f16(<32 x float> %a0, ptr %a1) nounwind {
 ; AVX2-NEXT:    callq __truncsfhf2@PLT
 ; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX2-NEXT:    vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
 ; AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
+; AVX2-NEXT:    callq __truncsfhf2@PLT
+; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-NEXT:    vmovshdup (%rsp), %xmm0 # 16-byte Folded Reload
+; AVX2-NEXT:    # xmm0 = mem[1,1,3,3]
+; AVX2-NEXT:    callq __truncsfhf2@PLT
+; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX2-NEXT:    vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
+; AVX2-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
+; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; AVX2-NEXT:    # xmm0 = mem[3,3,3,3]
+; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    callq __truncsfhf2@PLT
 ; AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
 ; AVX2-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
@@ -4727,35 +4751,20 @@ define void @store_cvt_32f32_to_32f16(<32 x float> %a0, ptr %a1) nounwind {
 ; AVX2-NEXT:    vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
 ; AVX2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
 ; AVX2-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX2-NEXT:    vzeroupper
-; AVX2-NEXT:    callq __truncsfhf2@PLT
-; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX2-NEXT:    # xmm0 = mem[1,1,3,3]
+; AVX2-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX2-NEXT:    # xmm0 = mem[3,3,3,3]
 ; AVX2-NEXT:    callq __truncsfhf2@PLT
-; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX2-NEXT:    vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; AVX2-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX2-NEXT:    # xmm0 = xmm0[0],mem[0]
-; AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
 ; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; AVX2-NEXT:    vzeroupper
-; AVX2-NEXT:    callq __truncsfhf2@PLT
-; AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
 ; AVX2-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; AVX2-NEXT:    # xmm0 = mem[1,0]
 ; AVX2-NEXT:    callq __truncsfhf2@PLT
-; AVX2-NEXT:    vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX2-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
 ; AVX2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX2-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX2-NEXT:    vinserti128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload
+; AVX2-NEXT:    vmovdqu %ymm0, (%rsp) # 32-byte Spill
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    callq __truncsfhf2@PLT
 ; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX2-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
@@ -4763,19 +4772,7 @@ define void @store_cvt_32f32_to_32f16(<32 x float> %a0, ptr %a1) nounwind {
 ; AVX2-NEXT:    callq __truncsfhf2@PLT
 ; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX2-NEXT:    vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
 ; AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX2-NEXT:    # xmm0 = mem[3,3,3,3]
-; AVX2-NEXT:    callq __truncsfhf2@PLT
-; AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; AVX2-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX2-NEXT:    # xmm0 = mem[1,0]
-; AVX2-NEXT:    callq __truncsfhf2@PLT
-; AVX2-NEXT:    vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX2-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
 ; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
 ; AVX2-NEXT:    vzeroupper
@@ -4786,11 +4783,11 @@ define void @store_cvt_32f32_to_32f16(<32 x float> %a0, ptr %a1) nounwind {
 ; AVX2-NEXT:    callq __truncsfhf2@PLT
 ; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX2-NEXT:    vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; AVX2-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX2-NEXT:    # xmm0 = xmm0[0],mem[0]
 ; AVX2-NEXT:    vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
+; AVX2-NEXT:    vpunpckldq (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
+; AVX2-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2]
 ; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
@@ -4804,20 +4801,12 @@ define void @store_cvt_32f32_to_32f16(<32 x float> %a0, ptr %a1) nounwind {
 ; AVX2-NEXT:    callq __truncsfhf2@PLT
 ; AVX2-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
 ; AVX2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-NEXT:    callq __truncsfhf2@PLT
-; AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; AVX2-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX2-NEXT:    # xmm0 = mem[1,1,3,3]
-; AVX2-NEXT:    callq __truncsfhf2@PLT
-; AVX2-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
-; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX2-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX2-NEXT:    # xmm0 = mem[3,3,3,3]
+; AVX2-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    callq __truncsfhf2@PLT
 ; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX2-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
@@ -4825,26 +4814,32 @@ define void @store_cvt_32f32_to_32f16(<32 x float> %a0, ptr %a1) nounwind {
 ; AVX2-NEXT:    callq __truncsfhf2@PLT
 ; AVX2-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
 ; AVX2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-NEXT:    vinserti128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload
+; AVX2-NEXT:    vmovdqu %ymm0, (%rsp) # 32-byte Spill
+; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    callq __truncsfhf2@PLT
-; AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX2-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; AVX2-NEXT:    # xmm0 = mem[1,1,3,3]
 ; AVX2-NEXT:    callq __truncsfhf2@PLT
-; AVX2-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
+; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX2-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; AVX2-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX2-NEXT:    # xmm0 = xmm0[0],mem[0]
 ; AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX2-NEXT:    callq __truncsfhf2@PLT
 ; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX2-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX2-NEXT:    # xmm0 = mem[1,1,3,3]
+; AVX2-NEXT:    callq __truncsfhf2@PLT
+; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX2-NEXT:    vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
+; AVX2-NEXT:    vpunpckldq (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
+; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX2-NEXT:    # xmm0 = mem[3,3,3,3]
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    callq __truncsfhf2@PLT
 ; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -4854,26 +4849,27 @@ define void @store_cvt_32f32_to_32f16(<32 x float> %a0, ptr %a1) nounwind {
 ; AVX2-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
 ; AVX2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
 ; AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-NEXT:    callq __truncsfhf2@PLT
-; AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; AVX2-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX2-NEXT:    # xmm0 = mem[1,1,3,3]
-; AVX2-NEXT:    callq __truncsfhf2@PLT
-; AVX2-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
-; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX2-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX2-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; AVX2-NEXT:    # xmm0 = mem[3,3,3,3]
 ; AVX2-NEXT:    callq __truncsfhf2@PLT
-; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
 ; AVX2-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; AVX2-NEXT:    # xmm0 = mem[1,0]
 ; AVX2-NEXT:    callq __truncsfhf2@PLT
-; AVX2-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX2-NEXT:    vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
 ; AVX2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; AVX2-NEXT:    vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
+; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    callq __truncsfhf2@PLT
+; AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX2-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX2-NEXT:    # xmm0 = mem[1,1,3,3]
+; AVX2-NEXT:    callq __truncsfhf2@PLT
+; AVX2-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
+; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
 ; AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
@@ -4885,15 +4881,15 @@ define void @store_cvt_32f32_to_32f16(<32 x float> %a0, ptr %a1) nounwind {
 ; AVX2-NEXT:    callq __truncsfhf2@PLT
 ; AVX2-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
 ; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX2-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; AVX2-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX2-NEXT:    # xmm0 = xmm0[0],mem[0]
 ; AVX2-NEXT:    vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
+; AVX2-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
+; AVX2-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-NEXT:    # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2]
 ; AVX2-NEXT:    vmovdqa %ymm0, 32(%rbx)
 ; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-NEXT:    vmovaps %ymm0, (%rbx)
-; AVX2-NEXT:    addq $176, %rsp
+; AVX2-NEXT:    addq $240, %rsp
 ; AVX2-NEXT:    popq %rbx
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq

diff  --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-4.ll
index 108aa91a52ce77..5ab0db77a9a065 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-4.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-4.ll
@@ -375,60 +375,46 @@ define void @load_i64_stride4_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ;
 ; AVX2-ONLY-LABEL: load_i64_stride4_vf8:
 ; AVX2-ONLY:       # %bb.0:
-; AVX2-ONLY-NEXT:    vmovaps 160(%rdi), %ymm13
-; AVX2-ONLY-NEXT:    vmovaps 128(%rdi), %ymm10
-; AVX2-ONLY-NEXT:    vmovaps 32(%rdi), %ymm2
-; AVX2-ONLY-NEXT:    vmovaps (%rdi), %ymm3
-; AVX2-ONLY-NEXT:    vmovaps 96(%rdi), %ymm4
-; AVX2-ONLY-NEXT:    vmovaps 64(%rdi), %ymm5
-; AVX2-ONLY-NEXT:    vmovaps 96(%rdi), %xmm6
-; AVX2-ONLY-NEXT:    vmovaps (%rdi), %xmm7
-; AVX2-ONLY-NEXT:    vmovaps 32(%rdi), %xmm8
-; AVX2-ONLY-NEXT:    vmovaps 64(%rdi), %xmm9
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm9[0],xmm6[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 224(%rdi), %xmm11
-; AVX2-ONLY-NEXT:    vmovaps 192(%rdi), %xmm12
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm12[0],xmm11[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 160(%rdi), %xmm14
-; AVX2-ONLY-NEXT:    vmovaps 128(%rdi), %xmm15
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm9[1],xmm6[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm9 = xmm15[0],xmm14[0]
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm11 = xmm12[1],xmm11[1]
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm12 = xmm7[0],xmm8[0]
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm14 = xmm15[1],xmm14[1]
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm7 = xmm7[1],xmm8[1]
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm8 = ymm5[0],ymm4[0],ymm5[2],ymm4[2]
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm15 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm8 = ymm15[2,3],ymm8[2,3]
-; AVX2-ONLY-NEXT:    vmovaps 224(%rdi), %ymm15
-; AVX2-ONLY-NEXT:    vmovaps 192(%rdi), %ymm0
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm0[0],ymm15[0],ymm0[2],ymm15[2]
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm6 = ymm10[0],ymm13[0],ymm10[2],ymm13[2]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm6[2,3],ymm1[2,3]
+; AVX2-ONLY-NEXT:    vmovaps 160(%rdi), %ymm0
+; AVX2-ONLY-NEXT:    vmovaps 128(%rdi), %ymm1
+; AVX2-ONLY-NEXT:    vmovaps 224(%rdi), %ymm2
+; AVX2-ONLY-NEXT:    vmovaps 192(%rdi), %ymm3
+; AVX2-ONLY-NEXT:    vmovaps 32(%rdi), %ymm4
+; AVX2-ONLY-NEXT:    vmovaps (%rdi), %ymm5
+; AVX2-ONLY-NEXT:    vmovaps 96(%rdi), %ymm6
+; AVX2-ONLY-NEXT:    vmovaps 64(%rdi), %ymm7
+; AVX2-ONLY-NEXT:    vmovaps (%rdi), %xmm8
+; AVX2-ONLY-NEXT:    vmovaps 32(%rdi), %xmm9
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 96(%rdi), %ymm9, %ymm9
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 64(%rdi), %ymm8, %ymm8
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm10 = ymm8[0],ymm9[0],ymm8[2],ymm9[2]
+; AVX2-ONLY-NEXT:    vmovaps 160(%rdi), %xmm11
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 224(%rdi), %ymm11, %ymm11
+; AVX2-ONLY-NEXT:    vmovaps 128(%rdi), %xmm12
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 192(%rdi), %ymm12, %ymm12
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm13 = ymm12[0],ymm11[0],ymm12[2],ymm11[2]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm8 = ymm8[1],ymm9[1],ymm8[3],ymm9[3]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm9 = ymm12[1],ymm11[1],ymm12[3],ymm11[3]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm11 = ymm7[0],ymm6[0],ymm7[2],ymm6[2]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm12 = ymm5[0],ymm4[0],ymm5[2],ymm4[2]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm11 = ymm12[2,3],ymm11[2,3]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm12 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm14 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm12 = ymm14[2,3],ymm12[2,3]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm6 = ymm7[1],ymm6[1],ymm7[3],ymm6[3]
 ; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm4 = ymm5[1],ymm4[1],ymm5[3],ymm4[3]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],ymm6[2,3]
 ; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm2 = ymm3[1],ymm2[1],ymm3[3],ymm2[3]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],ymm4[2,3]
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm15[1],ymm0[3],ymm15[3]
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm3 = ymm10[1],ymm13[1],ymm10[3],ymm13[3]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
-; AVX2-ONLY-NEXT:    vmovaps %xmm9, 32(%rsi)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm3, 48(%rsi)
-; AVX2-ONLY-NEXT:    vmovaps %xmm12, (%rsi)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm3, 16(%rsi)
-; AVX2-ONLY-NEXT:    vmovaps %xmm14, 32(%rdx)
-; AVX2-ONLY-NEXT:    vmovaps %xmm11, 48(%rdx)
-; AVX2-ONLY-NEXT:    vmovaps %xmm7, (%rdx)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm3, 16(%rdx)
-; AVX2-ONLY-NEXT:    vmovaps %ymm1, 32(%rcx)
-; AVX2-ONLY-NEXT:    vmovaps %ymm8, (%rcx)
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
+; AVX2-ONLY-NEXT:    vmovaps %ymm13, 32(%rsi)
+; AVX2-ONLY-NEXT:    vmovaps %ymm10, (%rsi)
+; AVX2-ONLY-NEXT:    vmovaps %ymm9, 32(%rdx)
+; AVX2-ONLY-NEXT:    vmovaps %ymm8, (%rdx)
+; AVX2-ONLY-NEXT:    vmovaps %ymm12, 32(%rcx)
+; AVX2-ONLY-NEXT:    vmovaps %ymm11, (%rcx)
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, 32(%r8)
-; AVX2-ONLY-NEXT:    vmovaps %ymm2, (%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm4, (%r8)
 ; AVX2-ONLY-NEXT:    vzeroupper
 ; AVX2-ONLY-NEXT:    retq
 ;
@@ -780,146 +766,116 @@ define void @load_i64_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-ONLY-LABEL: load_i64_stride4_vf16:
 ; AVX2-ONLY:       # %bb.0:
 ; AVX2-ONLY-NEXT:    subq $328, %rsp # imm = 0x148
-; AVX2-ONLY-NEXT:    vmovaps 224(%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps 192(%rdi), %xmm4
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm4[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 160(%rdi), %xmm6
-; AVX2-ONLY-NEXT:    vmovaps 128(%rdi), %xmm7
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm7[0],xmm6[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 416(%rdi), %xmm8
-; AVX2-ONLY-NEXT:    vmovaps 384(%rdi), %xmm9
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm9[0],xmm8[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 480(%rdi), %xmm10
-; AVX2-ONLY-NEXT:    vmovaps 448(%rdi), %xmm11
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm11[0],xmm10[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 352(%rdi), %xmm12
-; AVX2-ONLY-NEXT:    vmovaps 320(%rdi), %xmm13
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm13[0],xmm12[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 288(%rdi), %xmm14
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm4[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 256(%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm1 = xmm7[1],xmm6[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm0[0],xmm14[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm1 = xmm11[1],xmm10[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, (%rsp) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 96(%rdi), %xmm15
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm1 = xmm9[1],xmm8[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps (%rdi), %xmm9
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm1 = xmm13[1],xmm12[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 64(%rdi), %xmm12
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm14[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm12[0],xmm15[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm12[1],xmm15[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 32(%rdi), %xmm2
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm9[0],xmm2[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm9[1],xmm2[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 160(%rdi), %ymm14
-; AVX2-ONLY-NEXT:    vmovaps 128(%rdi), %ymm11
-; AVX2-ONLY-NEXT:    vmovaps 224(%rdi), %ymm7
+; AVX2-ONLY-NEXT:    vmovaps 416(%rdi), %ymm1
+; AVX2-ONLY-NEXT:    vmovaps 384(%rdi), %ymm3
+; AVX2-ONLY-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 160(%rdi), %ymm0
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 128(%rdi), %ymm5
+; AVX2-ONLY-NEXT:    vmovaps 224(%rdi), %ymm6
+; AVX2-ONLY-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovaps 192(%rdi), %ymm4
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm9 = ymm4[0],ymm7[0],ymm4[2],ymm7[2]
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm12 = ymm11[0],ymm14[0],ymm11[2],ymm14[2]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm12[2,3],ymm9[2,3]
-; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 416(%rdi), %ymm0
+; AVX2-ONLY-NEXT:    vmovaps 160(%rdi), %xmm2
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 224(%rdi), %ymm2, %ymm9
+; AVX2-ONLY-NEXT:    vmovaps 128(%rdi), %xmm2
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 192(%rdi), %ymm2, %ymm10
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm10[0],ymm9[0],ymm10[2],ymm9[2]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 384(%rdi), %ymm13
-; AVX2-ONLY-NEXT:    vmovaps 480(%rdi), %ymm5
-; AVX2-ONLY-NEXT:    vmovaps 448(%rdi), %ymm1
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm12 = ymm1[0],ymm5[0],ymm1[2],ymm5[2]
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm15 = ymm13[0],ymm0[0],ymm13[2],ymm0[2]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm15[2,3],ymm12[2,3]
-; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 32(%rdi), %ymm15
-; AVX2-ONLY-NEXT:    vmovaps (%rdi), %ymm10
-; AVX2-ONLY-NEXT:    vmovaps 96(%rdi), %ymm8
-; AVX2-ONLY-NEXT:    vmovaps 64(%rdi), %ymm2
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm3 = ymm2[0],ymm8[0],ymm2[2],ymm8[2]
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm12 = ymm10[0],ymm15[0],ymm10[2],ymm15[2]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm12[2,3],ymm3[2,3]
-; AVX2-ONLY-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 288(%rdi), %ymm0
+; AVX2-ONLY-NEXT:    vmovaps 416(%rdi), %xmm7
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 480(%rdi), %ymm7, %ymm11
+; AVX2-ONLY-NEXT:    vmovaps 384(%rdi), %xmm7
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 448(%rdi), %ymm7, %ymm12
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm12[0],ymm11[0],ymm12[2],ymm11[2]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 256(%rdi), %ymm9
-; AVX2-ONLY-NEXT:    vmovaps 352(%rdi), %ymm6
-; AVX2-ONLY-NEXT:    vmovaps 320(%rdi), %ymm3
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm12 = ymm3[0],ymm6[0],ymm3[2],ymm6[2]
-; AVX2-ONLY-NEXT:    vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm12 = ymm9[0],ymm0[0],ymm9[2],ymm0[2]
-; AVX2-ONLY-NEXT:    vperm2f128 $49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT:    # ymm12 = ymm12[2,3],mem[2,3]
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm4[1],ymm7[1],ymm4[3],ymm7[3]
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm7 = ymm11[1],ymm14[1],ymm11[3],ymm14[3]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm7[2,3],ymm0[2,3]
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm5[1],ymm1[3],ymm5[3]
-; AVX2-ONLY-NEXT:    vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm5 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT:    # ymm5 = ymm13[1],mem[1],ymm13[3],mem[3]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm5[2,3],ymm1[2,3]
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm8[1],ymm2[3],ymm8[3]
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm4 = ymm10[1],ymm15[1],ymm10[3],ymm15[3]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm4[2,3],ymm2[2,3]
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm6[1],ymm3[3],ymm6[3]
-; AVX2-ONLY-NEXT:    vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm4 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT:    # ymm4 = ymm9[1],mem[1],ymm9[3],mem[3]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3]
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm4, 16(%rsi)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm4, (%rsi)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm4, 64(%rsi)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm4, 80(%rsi)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm4, 112(%rsi)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm4, 96(%rsi)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm4, 32(%rsi)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm4, 48(%rsi)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm4, (%rdx)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm4, 16(%rdx)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm4, 64(%rdx)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm4, 80(%rdx)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm4, 96(%rdx)
-; AVX2-ONLY-NEXT:    vmovaps (%rsp), %xmm4 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm4, 112(%rdx)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm4, 32(%rdx)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm4, 48(%rdx)
-; AVX2-ONLY-NEXT:    vmovaps %ymm12, 64(%rcx)
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm4, (%rcx)
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm4, 96(%rcx)
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm4, 32(%rcx)
-; AVX2-ONLY-NEXT:    vmovaps %ymm3, 64(%r8)
-; AVX2-ONLY-NEXT:    vmovaps %ymm2, (%r8)
-; AVX2-ONLY-NEXT:    vmovaps %ymm1, 96(%r8)
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 32(%r8)
+; AVX2-ONLY-NEXT:    vmovaps (%rdi), %xmm8
+; AVX2-ONLY-NEXT:    vmovaps 32(%rdi), %xmm13
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 96(%rdi), %ymm13, %ymm13
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 64(%rdi), %ymm8, %ymm14
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm14[0],ymm13[0],ymm14[2],ymm13[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 288(%rdi), %xmm15
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 352(%rdi), %ymm15, %ymm15
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm10[1],ymm9[1],ymm10[3],ymm9[3]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 256(%rdi), %xmm10
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 320(%rdi), %ymm10, %ymm10
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm12[1],ymm11[1],ymm12[3],ymm11[3]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm10[0],ymm15[0],ymm10[2],ymm15[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm14[1],ymm13[1],ymm14[3],ymm13[3]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm10[1],ymm15[1],ymm10[3],ymm15[3]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm10 = ymm4[0],ymm6[0],ymm4[2],ymm6[2]
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm15 = ymm5[0],ymm12[0],ymm5[2],ymm12[2]
+; AVX2-ONLY-NEXT:    vmovaps %ymm5, %ymm11
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm10[2,3]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 480(%rdi), %ymm15
+; AVX2-ONLY-NEXT:    vmovaps 448(%rdi), %ymm5
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm5[0],ymm15[0],ymm5[2],ymm15[2]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm3[0],ymm1[0],ymm3[2],ymm1[2]
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, %ymm14
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, (%rsp) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 32(%rdi), %ymm9
+; AVX2-ONLY-NEXT:    vmovaps (%rdi), %ymm7
+; AVX2-ONLY-NEXT:    vmovaps 96(%rdi), %ymm3
+; AVX2-ONLY-NEXT:    vmovaps 64(%rdi), %ymm0
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm8 = ymm7[0],ymm9[0],ymm7[2],ymm9[2]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm8[2,3],ymm1[2,3]
+; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 288(%rdi), %ymm8
+; AVX2-ONLY-NEXT:    vmovaps 256(%rdi), %ymm6
+; AVX2-ONLY-NEXT:    vmovaps 352(%rdi), %ymm2
+; AVX2-ONLY-NEXT:    vmovaps 320(%rdi), %ymm1
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm10 = ymm1[0],ymm2[0],ymm1[2],ymm2[2]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm13 = ymm6[0],ymm8[0],ymm6[2],ymm8[2]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm10 = ymm13[2,3],ymm10[2,3]
+; AVX2-ONLY-NEXT:    vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm13 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm13 = ymm4[1],mem[1],ymm4[3],mem[3]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm12 = ymm11[1],ymm12[1],ymm11[3],ymm12[3]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm12 = ymm12[2,3],ymm13[2,3]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm4 = ymm5[1],ymm15[1],ymm5[3],ymm15[3]
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm13 = ymm5[1],ymm14[1],ymm5[3],ymm14[3]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm13[2,3],ymm4[2,3]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm3[1],ymm0[3],ymm3[3]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm3 = ymm7[1],ymm9[1],ymm7[3],ymm9[3]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm2 = ymm6[1],ymm8[1],ymm6[3],ymm8[3]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm2, 64(%rsi)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm2, (%rsi)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm2, 96(%rsi)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm2, 32(%rsi)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm2, 64(%rdx)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm2, (%rdx)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm2, 96(%rdx)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm2, 32(%rdx)
+; AVX2-ONLY-NEXT:    vmovaps %ymm10, 64(%rcx)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm2, (%rcx)
+; AVX2-ONLY-NEXT:    vmovups (%rsp), %ymm2 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm2, 96(%rcx)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm2, 32(%rcx)
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, 64(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, (%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm4, 96(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm12, 32(%r8)
 ; AVX2-ONLY-NEXT:    addq $328, %rsp # imm = 0x148
 ; AVX2-ONLY-NEXT:    vzeroupper
 ; AVX2-ONLY-NEXT:    retq
@@ -1650,113 +1606,81 @@ define void @load_i64_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-ONLY-LABEL: load_i64_stride4_vf32:
 ; AVX2-ONLY:       # %bb.0:
 ; AVX2-ONLY-NEXT:    subq $1224, %rsp # imm = 0x4C8
-; AVX2-ONLY-NEXT:    vmovaps 480(%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps 448(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 736(%rdi), %xmm2
-; AVX2-ONLY-NEXT:    vmovaps 704(%rdi), %xmm3
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm4 = xmm3[0],xmm2[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 224(%rdi), %xmm4
-; AVX2-ONLY-NEXT:    vmovaps 192(%rdi), %xmm5
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm6 = xmm5[0],xmm4[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 672(%rdi), %xmm6
-; AVX2-ONLY-NEXT:    vmovaps 640(%rdi), %xmm7
-; AVX2-ONLY-NEXT:    vmovaps 928(%rdi), %xmm8
-; AVX2-ONLY-NEXT:    vmovaps 896(%rdi), %xmm9
-; AVX2-ONLY-NEXT:    vmovaps 992(%rdi), %xmm10
-; AVX2-ONLY-NEXT:    vmovaps 960(%rdi), %xmm11
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm12 = xmm11[0],xmm10[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm10 = xmm11[1],xmm10[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm10 = xmm9[0],xmm8[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm8 = xmm9[1],xmm8[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm8 = xmm7[0],xmm6[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm2 = xmm3[1],xmm2[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 160(%rdi), %xmm2
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm3 = xmm7[1],xmm6[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 128(%rdi), %xmm3
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm4 = xmm5[1],xmm4[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm4 = xmm3[0],xmm2[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm2 = xmm3[1],xmm2[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 160(%rdi), %xmm0
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 224(%rdi), %ymm0, %ymm0
+; AVX2-ONLY-NEXT:    vmovaps 128(%rdi), %xmm1
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 192(%rdi), %ymm1, %ymm1
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovaps 416(%rdi), %xmm2
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 384(%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm0[0],xmm2[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 864(%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps 832(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, (%rsp) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 480(%rdi), %ymm2, %ymm2
+; AVX2-ONLY-NEXT:    vmovaps 384(%rdi), %xmm3
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 448(%rdi), %ymm3, %ymm3
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm4 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 672(%rdi), %xmm4
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 736(%rdi), %ymm4, %ymm4
+; AVX2-ONLY-NEXT:    vmovaps 640(%rdi), %xmm5
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 704(%rdi), %ymm5, %ymm5
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm6 = ymm5[0],ymm4[0],ymm5[2],ymm4[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 928(%rdi), %xmm6
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 992(%rdi), %ymm6, %ymm6
+; AVX2-ONLY-NEXT:    vmovaps 896(%rdi), %xmm7
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 960(%rdi), %ymm7, %ymm7
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm8 = ymm7[0],ymm6[0],ymm7[2],ymm6[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps (%rdi), %xmm8
+; AVX2-ONLY-NEXT:    vmovaps 32(%rdi), %xmm9
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 96(%rdi), %ymm9, %ymm9
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 64(%rdi), %ymm8, %ymm8
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm10 = ymm8[0],ymm9[0],ymm8[2],ymm9[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 288(%rdi), %xmm10
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 352(%rdi), %ymm10, %ymm10
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 256(%rdi), %xmm0
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 320(%rdi), %ymm0, %ymm0
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm1 = ymm3[1],ymm2[1],ymm3[3],ymm2[3]
+; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm0[0],ymm10[0],ymm0[2],ymm10[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm1 = ymm5[1],ymm4[1],ymm5[3],ymm4[3]
+; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 544(%rdi), %xmm1
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 608(%rdi), %ymm1, %ymm1
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm2 = ymm7[1],ymm6[1],ymm7[3],ymm6[3]
+; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 512(%rdi), %xmm2
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 576(%rdi), %ymm2, %ymm2
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm3 = ymm8[1],ymm9[1],ymm8[3],ymm9[3]
+; AVX2-ONLY-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[2],ymm1[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm10[1],ymm0[3],ymm10[3]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovaps 800(%rdi), %xmm0
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 864(%rdi), %ymm0, %ymm0
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3]
+; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovaps 768(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 96(%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps 64(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 608(%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps 576(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 544(%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps 512(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 352(%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps 320(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 288(%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps 256(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps (%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps 32(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm0[0],xmm1[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 832(%rdi), %ymm1, %ymm1
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovaps 160(%rdi), %ymm0
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 128(%rdi), %ymm1
-; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 224(%rdi), %ymm2
+; AVX2-ONLY-NEXT:    vmovaps 128(%rdi), %ymm2
 ; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 224(%rdi), %ymm1
+; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovaps 192(%rdi), %ymm3
 ; AVX2-ONLY-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm7 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm7[2,3],ymm2[2,3]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm1[0],ymm3[2],ymm1[2]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm3 = ymm2[0],ymm0[0],ymm2[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm3[2,3],ymm1[2,3]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovaps 416(%rdi), %ymm0
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -1766,9 +1690,9 @@ define void @load_i64_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovaps 448(%rdi), %ymm3
 ; AVX2-ONLY-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm7 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm7[2,3],ymm2[2,3]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm8 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm8[2,3],ymm3[2,3]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovaps 672(%rdi), %ymm0
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -1778,165 +1702,132 @@ define void @load_i64_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovaps 704(%rdi), %ymm3
 ; AVX2-ONLY-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm7 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm15 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm7[2,3]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm8 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm13 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm8[2,3]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovaps 928(%rdi), %ymm0
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovaps 896(%rdi), %ymm1
 ; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 992(%rdi), %ymm2
-; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 960(%rdi), %ymm3
-; AVX2-ONLY-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm7 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm15 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm7[2,3]
+; AVX2-ONLY-NEXT:    vmovaps 992(%rdi), %ymm14
+; AVX2-ONLY-NEXT:    vmovaps 960(%rdi), %ymm10
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm8 = ymm10[0],ymm14[0],ymm10[2],ymm14[2]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm13 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm8[2,3]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovaps 32(%rdi), %ymm0
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovaps (%rdi), %ymm1
+; AVX2-ONLY-NEXT:    vmovups %ymm1, (%rsp) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 96(%rdi), %ymm9
+; AVX2-ONLY-NEXT:    vmovaps 64(%rdi), %ymm6
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm8 = ymm6[0],ymm9[0],ymm6[2],ymm9[2]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm13 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm8[2,3]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 288(%rdi), %ymm0
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 256(%rdi), %ymm1
 ; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 96(%rdi), %ymm14
-; AVX2-ONLY-NEXT:    vmovaps 64(%rdi), %ymm13
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm7 = ymm13[0],ymm14[0],ymm13[2],ymm14[2]
+; AVX2-ONLY-NEXT:    vmovaps 352(%rdi), %ymm5
+; AVX2-ONLY-NEXT:    vmovaps 320(%rdi), %ymm4
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm13 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
 ; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm15 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm7[2,3]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm13[2,3]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 288(%rdi), %ymm12
-; AVX2-ONLY-NEXT:    vmovaps 256(%rdi), %ymm11
-; AVX2-ONLY-NEXT:    vmovaps 352(%rdi), %ymm10
-; AVX2-ONLY-NEXT:    vmovaps 320(%rdi), %ymm9
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm7 = ymm9[0],ymm10[0],ymm9[2],ymm10[2]
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm15 = ymm11[0],ymm12[0],ymm11[2],ymm12[2]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm7[2,3]
+; AVX2-ONLY-NEXT:    vmovaps 544(%rdi), %ymm15
+; AVX2-ONLY-NEXT:    vmovaps 512(%rdi), %ymm8
+; AVX2-ONLY-NEXT:    vmovaps 608(%rdi), %ymm3
+; AVX2-ONLY-NEXT:    vmovaps 576(%rdi), %ymm2
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm13 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm12 = ymm8[0],ymm15[0],ymm8[2],ymm15[2]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm12[2,3],ymm13[2,3]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 544(%rdi), %ymm8
-; AVX2-ONLY-NEXT:    vmovaps 512(%rdi), %ymm6
-; AVX2-ONLY-NEXT:    vmovaps 608(%rdi), %ymm5
-; AVX2-ONLY-NEXT:    vmovaps 576(%rdi), %ymm4
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm7 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm15 = ymm6[0],ymm8[0],ymm6[2],ymm8[2]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm7[2,3]
-; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 800(%rdi), %ymm3
-; AVX2-ONLY-NEXT:    vmovaps 768(%rdi), %ymm2
+; AVX2-ONLY-NEXT:    vmovaps 800(%rdi), %ymm11
+; AVX2-ONLY-NEXT:    vmovaps 768(%rdi), %ymm7
 ; AVX2-ONLY-NEXT:    vmovaps 864(%rdi), %ymm1
 ; AVX2-ONLY-NEXT:    vmovaps 832(%rdi), %ymm0
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm7 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm15 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm7 = ymm15[2,3],ymm7[2,3]
-; AVX2-ONLY-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm15 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT:    # ymm15 = ymm7[1],mem[1],ymm7[3],mem[3]
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT:    # ymm7 = ymm7[1],mem[1],ymm7[3],mem[3]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm7 = ymm7[2,3],ymm15[2,3]
-; AVX2-ONLY-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT:    # ymm7 = ymm7[1],mem[1],ymm7[3],mem[3]
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT:    # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm7 = ymm15[2,3],ymm7[2,3]
-; AVX2-ONLY-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT:    # ymm7 = ymm7[1],mem[1],ymm7[3],mem[3]
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT:    # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm7 = ymm15[2,3],ymm7[2,3]
-; AVX2-ONLY-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT:    # ymm7 = ymm7[1],mem[1],ymm7[3],mem[3]
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT:    # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm15 = ymm15[2,3],ymm7[2,3]
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm7 = ymm13[1],ymm14[1],ymm13[3],ymm14[3]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm12 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm13 = ymm7[0],ymm11[0],ymm7[2],ymm11[2]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm12 = ymm13[2,3],ymm12[2,3]
+; AVX2-ONLY-NEXT:    vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm12 = ymm12[1],mem[1],ymm12[3],mem[3]
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm12 = ymm13[2,3],ymm12[2,3]
+; AVX2-ONLY-NEXT:    vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm12 = ymm12[1],mem[1],ymm12[3],mem[3]
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm12 = ymm13[2,3],ymm12[2,3]
+; AVX2-ONLY-NEXT:    vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm12 = ymm12[1],mem[1],ymm12[3],mem[3]
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm12 = ymm13[2,3],ymm12[2,3]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm10 = ymm10[1],ymm14[1],ymm10[3],ymm14[3]
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
 ; AVX2-ONLY-NEXT:    # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm13 = ymm13[2,3],ymm7[2,3]
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm7 = ymm9[1],ymm10[1],ymm9[3],ymm10[3]
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm9 = ymm11[1],ymm12[1],ymm11[3],ymm12[3]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm9 = ymm9[2,3],ymm7[2,3]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm10 = ymm13[2,3],ymm10[2,3]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm6 = ymm6[1],ymm9[1],ymm6[3],ymm9[3]
+; AVX2-ONLY-NEXT:    vmovups (%rsp), %ymm9 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm9 = ymm9[1],mem[1],ymm9[3],mem[3]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm9 = ymm9[2,3],ymm6[2,3]
 ; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3]
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm5 = ymm6[1],ymm8[1],ymm6[3],ymm8[3]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm5[2,3],ymm4[2,3]
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm5 = ymm5[1],mem[1],ymm5[3],mem[3]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm5 = ymm5[2,3],ymm4[2,3]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm3 = ymm8[1],ymm15[1],ymm8[3],ymm15[3]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm3[2,3],ymm2[2,3]
 ; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm1 = ymm7[1],ymm11[1],ymm7[3],ymm11[3]
 ; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
-; AVX2-ONLY-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 208(%rsi)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 192(%rsi)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 128(%rsi)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 64(%rsi)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, (%rsi)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 144(%rsi)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 80(%rsi)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 16(%rsi)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 240(%rsi)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 224(%rsi)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 32(%rsi)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 160(%rsi)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 96(%rsi)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 48(%rsi)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 176(%rsi)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 112(%rsi)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 64(%rdx)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 80(%rdx)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 128(%rdx)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 144(%rdx)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, (%rdx)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 16(%rdx)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 192(%rdx)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 208(%rdx)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 96(%rdx)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 112(%rdx)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 32(%rdx)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 48(%rdx)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 160(%rdx)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 176(%rdx)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 224(%rdx)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 240(%rdx)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, 192(%rsi)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, 128(%rsi)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, 64(%rsi)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, (%rsi)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, 224(%rsi)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, 160(%rsi)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, 96(%rsi)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, 32(%rsi)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, 192(%rdx)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, 128(%rdx)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, 64(%rdx)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, (%rdx)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, 224(%rdx)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, 160(%rdx)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, 96(%rdx)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, 32(%rdx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm1, 192(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
@@ -1954,12 +1845,11 @@ define void @load_i64_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm1, 32(%rcx)
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, 192(%r8)
-; AVX2-ONLY-NEXT:    vmovaps %ymm4, 128(%r8)
-; AVX2-ONLY-NEXT:    vmovaps %ymm9, 64(%r8)
-; AVX2-ONLY-NEXT:    vmovaps %ymm13, (%r8)
-; AVX2-ONLY-NEXT:    vmovaps %ymm15, 224(%r8)
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 160(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm2, 128(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm5, 64(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm9, (%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm10, 224(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm12, 160(%r8)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, 96(%r8)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
@@ -3434,202 +3324,138 @@ define void @load_i64_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-ONLY-LABEL: load_i64_stride4_vf64:
 ; AVX2-ONLY:       # %bb.0:
 ; AVX2-ONLY-NEXT:    subq $3016, %rsp # imm = 0xBC8
-; AVX2-ONLY-NEXT:    vmovaps 1248(%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps 1216(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 1504(%rdi), %xmm2
-; AVX2-ONLY-NEXT:    vmovaps 1472(%rdi), %xmm3
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm4 = xmm3[0],xmm2[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 1760(%rdi), %xmm4
-; AVX2-ONLY-NEXT:    vmovaps 1728(%rdi), %xmm5
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm6 = xmm5[0],xmm4[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 1440(%rdi), %xmm6
-; AVX2-ONLY-NEXT:    vmovaps 1696(%rdi), %xmm7
-; AVX2-ONLY-NEXT:    vmovaps 1952(%rdi), %xmm8
-; AVX2-ONLY-NEXT:    vmovaps 1920(%rdi), %xmm9
-; AVX2-ONLY-NEXT:    vmovaps 2016(%rdi), %xmm10
-; AVX2-ONLY-NEXT:    vmovaps 1984(%rdi), %xmm11
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm12 = xmm11[0],xmm10[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm10 = xmm11[1],xmm10[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm10 = xmm9[0],xmm8[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm8 = xmm9[1],xmm8[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 1664(%rdi), %xmm8
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm4 = xmm5[1],xmm4[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm4 = xmm8[0],xmm7[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm4 = xmm8[1],xmm7[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 1408(%rdi), %xmm4
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm2 = xmm3[1],xmm2[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm4[0],xmm6[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm2 = xmm4[1],xmm6[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 1184(%rdi), %xmm2
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 1152(%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm0[0],xmm2[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 992(%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps 960(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 928(%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps 896(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 736(%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps 704(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 672(%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps 640(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 224(%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps 192(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX2-ONLY-NEXT:    vmovaps 160(%rdi), %xmm0
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 224(%rdi), %ymm0, %ymm0
 ; AVX2-ONLY-NEXT:    vmovaps 128(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 480(%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps 448(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 416(%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps 384(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 1888(%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps 1856(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 1824(%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps 1792(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 1632(%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps 1600(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 1568(%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps 1536(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 1376(%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps 1344(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 1312(%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps 1280(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 864(%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps 832(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 800(%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps 768(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, (%rsp) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 96(%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps 64(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 352(%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps 320(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 192(%rdi), %ymm1, %ymm1
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 416(%rdi), %xmm2
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 480(%rdi), %ymm2, %ymm2
+; AVX2-ONLY-NEXT:    vmovaps 384(%rdi), %xmm3
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 448(%rdi), %ymm3, %ymm3
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm4 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 672(%rdi), %xmm4
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 736(%rdi), %ymm4, %ymm4
+; AVX2-ONLY-NEXT:    vmovaps 640(%rdi), %xmm5
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 704(%rdi), %ymm5, %ymm5
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm6 = ymm5[0],ymm4[0],ymm5[2],ymm4[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 928(%rdi), %xmm6
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 992(%rdi), %ymm6, %ymm6
+; AVX2-ONLY-NEXT:    vmovaps 896(%rdi), %xmm7
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 960(%rdi), %ymm7, %ymm7
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm8 = ymm7[0],ymm6[0],ymm7[2],ymm6[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 1184(%rdi), %xmm8
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 1248(%rdi), %ymm8, %ymm8
+; AVX2-ONLY-NEXT:    vmovaps 1152(%rdi), %xmm9
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 1216(%rdi), %ymm9, %ymm9
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm10 = ymm9[0],ymm8[0],ymm9[2],ymm8[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 1440(%rdi), %xmm10
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 1504(%rdi), %ymm10, %ymm10
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 1408(%rdi), %xmm0
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 1472(%rdi), %ymm0, %ymm0
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm1 = ymm3[1],ymm2[1],ymm3[3],ymm2[3]
+; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm0[0],ymm10[0],ymm0[2],ymm10[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm1 = ymm5[1],ymm4[1],ymm5[3],ymm4[3]
+; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 1696(%rdi), %xmm1
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 1760(%rdi), %ymm1, %ymm1
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm2 = ymm7[1],ymm6[1],ymm7[3],ymm6[3]
+; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 1664(%rdi), %xmm2
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 1728(%rdi), %ymm2, %ymm2
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm3 = ymm9[1],ymm8[1],ymm9[3],ymm8[3]
+; AVX2-ONLY-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[2],ymm1[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm10[1],ymm0[3],ymm10[3]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 1952(%rdi), %xmm0
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 2016(%rdi), %ymm0, %ymm0
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3]
+; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 1920(%rdi), %xmm1
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 1984(%rdi), %ymm1, %ymm1
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 32(%rdi), %xmm0
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 96(%rdi), %ymm0, %ymm0
+; AVX2-ONLY-NEXT:    vmovaps (%rdi), %xmm1
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 64(%rdi), %ymm1, %ymm1
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovaps 288(%rdi), %xmm0
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 352(%rdi), %ymm0, %ymm0
 ; AVX2-ONLY-NEXT:    vmovaps 256(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 1120(%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps 1088(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 1056(%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps 1024(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 608(%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps 576(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 320(%rdi), %ymm1, %ymm1
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovaps 544(%rdi), %xmm0
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 608(%rdi), %ymm0, %ymm0
 ; AVX2-ONLY-NEXT:    vmovaps 512(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps (%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps 32(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm0[0],xmm1[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 576(%rdi), %ymm1, %ymm1
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 800(%rdi), %xmm0
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 864(%rdi), %ymm0, %ymm0
+; AVX2-ONLY-NEXT:    vmovaps 768(%rdi), %xmm1
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 832(%rdi), %ymm1, %ymm1
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 1056(%rdi), %xmm0
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 1120(%rdi), %ymm0, %ymm0
+; AVX2-ONLY-NEXT:    vmovaps 1024(%rdi), %xmm1
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 1088(%rdi), %ymm1, %ymm1
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 1312(%rdi), %xmm0
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 1376(%rdi), %ymm0, %ymm0
+; AVX2-ONLY-NEXT:    vmovaps 1280(%rdi), %xmm1
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 1344(%rdi), %ymm1, %ymm1
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 1568(%rdi), %xmm0
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 1632(%rdi), %ymm0, %ymm0
+; AVX2-ONLY-NEXT:    vmovaps 1536(%rdi), %xmm1
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 1600(%rdi), %ymm1, %ymm1
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 1824(%rdi), %xmm0
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 1888(%rdi), %ymm0, %ymm0
+; AVX2-ONLY-NEXT:    vmovaps 1792(%rdi), %xmm1
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 1856(%rdi), %ymm1, %ymm1
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovaps 160(%rdi), %ymm2
 ; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovaps 128(%rdi), %ymm1
-; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovups %ymm1, (%rsp) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovaps 224(%rdi), %ymm0
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovaps 192(%rdi), %ymm3
@@ -3815,11 +3641,11 @@ define void @load_i64_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
 ; AVX2-ONLY-NEXT:    # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovups (%rsp), %ymm12 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload
 ; AVX2-ONLY-NEXT:    # ymm12 = ymm12[1],mem[1],ymm12[3],mem[3]
 ; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm12[2,3],ymm0[2,3]
-; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovups %ymm0, (%rsp) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
 ; AVX2-ONLY-NEXT:    # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
@@ -3850,8 +3676,7 @@ define void @load_i64_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload
 ; AVX2-ONLY-NEXT:    # ymm12 = ymm12[1],mem[1],ymm12[3],mem[3]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm12[2,3],ymm0[2,3]
-; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm12 = ymm12[2,3],ymm0[2,3]
 ; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm10[1],ymm15[1],ymm10[3],ymm15[3]
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload
@@ -3863,7 +3688,8 @@ define void @load_i64_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload
 ; AVX2-ONLY-NEXT:    # ymm10 = ymm10[1],mem[1],ymm10[3],mem[3]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm10 = ymm10[2,3],ymm0[2,3]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm10[2,3],ymm0[2,3]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm7[1],ymm9[1],ymm7[3],ymm9[3]
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
@@ -3875,19 +3701,19 @@ define void @load_i64_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload
 ; AVX2-ONLY-NEXT:    # ymm9 = ymm9[1],mem[1],ymm9[3],mem[3]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm9 = ymm9[2,3],ymm0[2,3]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm10 = ymm9[2,3],ymm0[2,3]
 ; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm5[1],ymm6[1],ymm5[3],ymm6[3]
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
 ; AVX2-ONLY-NEXT:    # ymm5 = ymm5[1],mem[1],ymm5[3],mem[3]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm6 = ymm5[2,3],ymm0[2,3]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm9 = ymm5[2,3],ymm0[2,3]
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
 ; AVX2-ONLY-NEXT:    # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
 ; AVX2-ONLY-NEXT:    # ymm5 = ymm5[1],mem[1],ymm5[3],mem[3]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm12 = ymm5[2,3],ymm0[2,3]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm6 = ymm5[2,3],ymm0[2,3]
 ; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm3[1],ymm4[1],ymm3[3],ymm4[3]
 ; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm3 = ymm11[1],ymm14[1],ymm11[3],ymm14[3]
 ; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm5 = ymm3[2,3],ymm0[2,3]
@@ -3908,134 +3734,70 @@ define void @load_i64_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-ONLY-NEXT:    vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
 ; AVX2-ONLY-NEXT:    # ymm2 = ymm2[1],mem[1],ymm2[3],mem[3]
 ; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3]
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, 464(%rsi)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, 448(%rsi)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, 256(%rsi)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, 384(%rsi)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, 320(%rsi)
-; AVX2-ONLY-NEXT:    vmovaps (%rsp), %xmm2 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, 192(%rsi)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, 128(%rsi)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, 64(%rsi)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, (%rsi)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, 272(%rsi)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, 400(%rsi)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, 336(%rsi)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, 208(%rsi)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, 144(%rsi)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, 80(%rsi)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, 16(%rsi)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, 496(%rsi)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, 480(%rsi)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, 32(%rsi)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, 416(%rsi)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, 352(%rsi)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, 288(%rsi)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, 224(%rsi)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, 160(%rsi)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, 96(%rsi)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, 48(%rsi)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, 432(%rsi)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, 368(%rsi)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, 304(%rsi)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, 240(%rsi)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, 176(%rsi)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, 112(%rsi)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, 128(%rdx)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, 144(%rdx)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, 256(%rdx)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, 272(%rdx)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, 64(%rdx)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, 80(%rdx)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, (%rdx)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, 16(%rdx)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, 192(%rdx)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, 208(%rdx)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, 320(%rdx)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, 336(%rdx)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, 384(%rdx)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, 400(%rdx)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, 448(%rdx)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, 464(%rdx)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, 96(%rdx)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, 112(%rdx)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, 32(%rdx)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, 48(%rdx)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, 160(%rdx)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, 176(%rdx)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, 224(%rdx)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, 240(%rdx)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, 288(%rdx)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, 304(%rdx)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, 352(%rdx)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, 368(%rdx)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, 416(%rdx)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, 432(%rdx)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, 480(%rdx)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, 496(%rdx)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm2, 448(%rsi)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm2, 384(%rsi)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm2, 320(%rsi)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm2, 256(%rsi)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm2, 192(%rsi)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm2, 128(%rsi)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm2, 64(%rsi)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm2, (%rsi)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm2, 480(%rsi)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm2, 416(%rsi)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm2, 352(%rsi)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm2, 288(%rsi)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm2, 224(%rsi)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm2, 160(%rsi)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm2, 96(%rsi)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm2, 32(%rsi)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm2, 448(%rdx)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm2, 384(%rdx)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm2, 320(%rdx)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm2, 256(%rdx)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm2, 192(%rdx)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm2, 128(%rdx)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm2, 64(%rdx)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm2, (%rdx)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm2, 480(%rdx)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm2, 416(%rdx)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm2, 352(%rdx)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm2, 288(%rdx)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm2, 224(%rdx)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm2, 160(%rdx)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm2, 96(%rdx)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm2, 32(%rdx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm2, 448(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
@@ -4072,21 +3834,21 @@ define void @load_i64_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-ONLY-NEXT:    vmovaps %ymm1, 448(%r8)
 ; AVX2-ONLY-NEXT:    vmovaps %ymm4, 416(%r8)
 ; AVX2-ONLY-NEXT:    vmovaps %ymm5, 384(%r8)
-; AVX2-ONLY-NEXT:    vmovaps %ymm12, 352(%r8)
-; AVX2-ONLY-NEXT:    vmovaps %ymm6, 320(%r8)
-; AVX2-ONLY-NEXT:    vmovaps %ymm9, 288(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm6, 352(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm9, 320(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm10, 288(%r8)
 ; AVX2-ONLY-NEXT:    vmovaps %ymm7, 256(%r8)
-; AVX2-ONLY-NEXT:    vmovaps %ymm10, 224(%r8)
-; AVX2-ONLY-NEXT:    vmovaps %ymm15, 192(%r8)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 160(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 224(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm15, 192(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm12, 160(%r8)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, 128(%r8)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, 96(%r8)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, 64(%r8)
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, 32(%r8)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, (%r8)

diff  --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-8.ll
index d04f5872da71b2..b84fff274cd716 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-8.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-8.ll
@@ -301,45 +301,37 @@ define void @load_i64_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-ONLY-NEXT:    vmovaps (%rdi), %ymm5
 ; AVX2-ONLY-NEXT:    vmovaps 192(%rdi), %ymm6
 ; AVX2-ONLY-NEXT:    vmovaps 128(%rdi), %ymm7
-; AVX2-ONLY-NEXT:    vmovaps 192(%rdi), %xmm8
-; AVX2-ONLY-NEXT:    vmovaps 128(%rdi), %xmm9
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm10 = xmm9[0],xmm8[0]
-; AVX2-ONLY-NEXT:    vmovaps (%rdi), %xmm11
-; AVX2-ONLY-NEXT:    vmovaps 32(%rdi), %xmm12
-; AVX2-ONLY-NEXT:    vmovaps 64(%rdi), %xmm13
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm14 = xmm11[0],xmm13[0]
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm8 = xmm9[1],xmm8[1]
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm9 = xmm11[1],xmm13[1]
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm11 = ymm7[0],ymm6[0],ymm7[2],ymm6[2]
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm13 = ymm5[0],ymm4[0],ymm5[2],ymm4[2]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm11 = ymm13[2,3],ymm11[2,3]
+; AVX2-ONLY-NEXT:    vmovaps (%rdi), %xmm8
+; AVX2-ONLY-NEXT:    vmovaps 32(%rdi), %xmm9
+; AVX2-ONLY-NEXT:    vmovaps 64(%rdi), %xmm10
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 192(%rdi), %ymm10, %ymm10
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 128(%rdi), %ymm8, %ymm8
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm11 = ymm8[0],ymm10[0],ymm8[2],ymm10[2]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm8 = ymm8[1],ymm10[1],ymm8[3],ymm10[3]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm10 = ymm7[0],ymm6[0],ymm7[2],ymm6[2]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm12 = ymm5[0],ymm4[0],ymm5[2],ymm4[2]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm10 = ymm12[2,3],ymm10[2,3]
 ; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm6 = ymm7[1],ymm6[1],ymm7[3],ymm6[3]
 ; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm4 = ymm5[1],ymm4[1],ymm5[3],ymm4[3]
 ; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],ymm6[2,3]
-; AVX2-ONLY-NEXT:    vmovaps 224(%rdi), %xmm5
-; AVX2-ONLY-NEXT:    vmovaps 160(%rdi), %xmm6
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm7 = xmm6[0],xmm5[0]
-; AVX2-ONLY-NEXT:    vmovaps 96(%rdi), %xmm13
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm15 = xmm12[0],xmm13[0]
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm5 = xmm6[1],xmm5[1]
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm6 = xmm12[1],xmm13[1]
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm12 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm13 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm12 = ymm13[2,3],ymm12[2,3]
+; AVX2-ONLY-NEXT:    vmovaps 96(%rdi), %xmm5
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 224(%rdi), %ymm5, %ymm5
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 160(%rdi), %ymm9, %ymm6
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm7 = ymm6[0],ymm5[0],ymm6[2],ymm5[2]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm5 = ymm6[1],ymm5[1],ymm6[3],ymm5[3]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm6 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm9 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm6 = ymm9[2,3],ymm6[2,3]
 ; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm2 = ymm3[1],ymm2[1],ymm3[3],ymm2[3]
 ; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
 ; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
-; AVX2-ONLY-NEXT:    vmovaps %xmm14, (%rsi)
-; AVX2-ONLY-NEXT:    vmovaps %xmm10, 16(%rsi)
-; AVX2-ONLY-NEXT:    vmovaps %xmm9, (%rdx)
-; AVX2-ONLY-NEXT:    vmovaps %xmm8, 16(%rdx)
-; AVX2-ONLY-NEXT:    vmovaps %ymm11, (%rcx)
+; AVX2-ONLY-NEXT:    vmovaps %ymm11, (%rsi)
+; AVX2-ONLY-NEXT:    vmovaps %ymm8, (%rdx)
+; AVX2-ONLY-NEXT:    vmovaps %ymm10, (%rcx)
 ; AVX2-ONLY-NEXT:    vmovaps %ymm4, (%r8)
-; AVX2-ONLY-NEXT:    vmovaps %xmm15, (%r9)
-; AVX2-ONLY-NEXT:    vmovaps %xmm7, 16(%r9)
-; AVX2-ONLY-NEXT:    vmovaps %xmm6, (%r11)
-; AVX2-ONLY-NEXT:    vmovaps %xmm5, 16(%r11)
-; AVX2-ONLY-NEXT:    vmovaps %ymm12, (%r10)
+; AVX2-ONLY-NEXT:    vmovaps %ymm7, (%r9)
+; AVX2-ONLY-NEXT:    vmovaps %ymm5, (%r11)
+; AVX2-ONLY-NEXT:    vmovaps %ymm6, (%r10)
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, (%rax)
 ; AVX2-ONLY-NEXT:    vzeroupper
 ; AVX2-ONLY-NEXT:    retq
@@ -353,57 +345,53 @@ define void @load_i64_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512-NEXT:    vmovdqa64 (%rdi), %zmm4
 ; AVX512-NEXT:    vmovdqa64 192(%rdi), %zmm5
 ; AVX512-NEXT:    vmovdqa64 128(%rdi), %zmm6
-; AVX512-NEXT:    vmovaps 192(%rdi), %xmm0
-; AVX512-NEXT:    vmovaps 128(%rdi), %xmm1
-; AVX512-NEXT:    vmovlhps {{.*#+}} xmm7 = xmm1[0],xmm0[0]
-; AVX512-NEXT:    vmovaps (%rdi), %xmm2
-; AVX512-NEXT:    vmovaps 64(%rdi), %xmm8
-; AVX512-NEXT:    vmovlhps {{.*#+}} xmm9 = xmm2[0],xmm8[0]
-; AVX512-NEXT:    vunpckhpd {{.*#+}} xmm10 = xmm1[1],xmm0[1]
-; AVX512-NEXT:    vunpckhpd {{.*#+}} xmm8 = xmm2[1],xmm8[1]
+; AVX512-NEXT:    vmovaps (%rdi), %xmm0
+; AVX512-NEXT:    vmovaps 64(%rdi), %xmm1
+; AVX512-NEXT:    vinsertf128 $1, 192(%rdi), %ymm1, %ymm1
+; AVX512-NEXT:    vinsertf128 $1, 128(%rdi), %ymm0, %ymm0
+; AVX512-NEXT:    vunpcklpd {{.*#+}} ymm7 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; AVX512-NEXT:    vunpckhpd {{.*#+}} ymm8 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
 ; AVX512-NEXT:    vmovaps 192(%rdi), %ymm1
 ; AVX512-NEXT:    vmovaps 128(%rdi), %ymm2
 ; AVX512-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm2[0],ymm1[0],ymm2[2],ymm1[2]
-; AVX512-NEXT:    vmovaps 64(%rdi), %ymm11
-; AVX512-NEXT:    vmovaps (%rdi), %ymm12
-; AVX512-NEXT:    vunpcklpd {{.*#+}} ymm13 = ymm12[0],ymm11[0],ymm12[2],ymm11[2]
-; AVX512-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3]
+; AVX512-NEXT:    vmovaps 64(%rdi), %ymm9
+; AVX512-NEXT:    vmovaps (%rdi), %ymm10
+; AVX512-NEXT:    vunpcklpd {{.*#+}} ymm11 = ymm10[0],ymm9[0],ymm10[2],ymm9[2]
+; AVX512-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm11[2,3],ymm0[2,3]
 ; AVX512-NEXT:    vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3]
-; AVX512-NEXT:    vunpckhpd {{.*#+}} ymm2 = ymm12[1],ymm11[1],ymm12[3],ymm11[3]
+; AVX512-NEXT:    vunpckhpd {{.*#+}} ymm2 = ymm10[1],ymm9[1],ymm10[3],ymm9[3]
 ; AVX512-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
 ; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [4,12,4,12]
 ; AVX512-NEXT:    # ymm2 = mem[0,1,0,1]
 ; AVX512-NEXT:    vpermi2q %zmm5, %zmm6, %zmm2
-; AVX512-NEXT:    vmovdqa {{.*#+}} xmm11 = [4,12]
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm9 = [4,12]
+; AVX512-NEXT:    vpermi2q %zmm3, %zmm4, %zmm9
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3],ymm2[4,5,6,7]
+; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm9 = [5,13,5,13]
+; AVX512-NEXT:    # ymm9 = mem[0,1,0,1]
+; AVX512-NEXT:    vpermi2q %zmm5, %zmm6, %zmm9
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm10 = [5,13]
+; AVX512-NEXT:    vpermi2q %zmm3, %zmm4, %zmm10
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7]
+; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm10 = [6,14,6,14]
+; AVX512-NEXT:    # ymm10 = mem[0,1,0,1]
+; AVX512-NEXT:    vpermi2q %zmm5, %zmm6, %zmm10
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm11 = [6,14]
 ; AVX512-NEXT:    vpermi2q %zmm3, %zmm4, %zmm11
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm2[4,5,6,7]
-; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm11 = [5,13,5,13]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7]
+; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm11 = [7,15,7,15]
 ; AVX512-NEXT:    # ymm11 = mem[0,1,0,1]
 ; AVX512-NEXT:    vpermi2q %zmm5, %zmm6, %zmm11
-; AVX512-NEXT:    vmovdqa {{.*#+}} xmm12 = [5,13]
-; AVX512-NEXT:    vpermi2q %zmm3, %zmm4, %zmm12
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7]
-; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm12 = [6,14,6,14]
-; AVX512-NEXT:    # ymm12 = mem[0,1,0,1]
-; AVX512-NEXT:    vpermi2q %zmm5, %zmm6, %zmm12
-; AVX512-NEXT:    vmovdqa {{.*#+}} xmm13 = [6,14]
-; AVX512-NEXT:    vpermi2q %zmm3, %zmm4, %zmm13
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7]
-; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm13 = [7,15,7,15]
-; AVX512-NEXT:    # ymm13 = mem[0,1,0,1]
-; AVX512-NEXT:    vpermi2q %zmm5, %zmm6, %zmm13
 ; AVX512-NEXT:    vmovdqa {{.*#+}} xmm5 = [7,15]
 ; AVX512-NEXT:    vpermi2q %zmm3, %zmm4, %zmm5
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm13[4,5,6,7]
-; AVX512-NEXT:    vmovaps %xmm9, (%rsi)
-; AVX512-NEXT:    vmovaps %xmm7, 16(%rsi)
-; AVX512-NEXT:    vmovaps %xmm8, (%rdx)
-; AVX512-NEXT:    vmovaps %xmm10, 16(%rdx)
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm11[4,5,6,7]
+; AVX512-NEXT:    vmovaps %ymm7, (%rsi)
+; AVX512-NEXT:    vmovaps %ymm8, (%rdx)
 ; AVX512-NEXT:    vmovaps %ymm0, (%rcx)
 ; AVX512-NEXT:    vmovaps %ymm1, (%r8)
 ; AVX512-NEXT:    vmovdqa %ymm2, (%r9)
-; AVX512-NEXT:    vmovdqa %ymm11, (%r11)
-; AVX512-NEXT:    vmovdqa %ymm12, (%r10)
+; AVX512-NEXT:    vmovdqa %ymm9, (%r11)
+; AVX512-NEXT:    vmovdqa %ymm10, (%r10)
 ; AVX512-NEXT:    vmovdqa %ymm3, (%rax)
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
@@ -721,137 +709,107 @@ define void @load_i64_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ;
 ; AVX2-ONLY-LABEL: load_i64_stride8_vf8:
 ; AVX2-ONLY:       # %bb.0:
-; AVX2-ONLY-NEXT:    subq $184, %rsp
-; AVX2-ONLY-NEXT:    vmovaps 64(%rdi), %ymm11
-; AVX2-ONLY-NEXT:    vmovaps (%rdi), %ymm12
-; AVX2-ONLY-NEXT:    vmovaps 320(%rdi), %ymm9
-; AVX2-ONLY-NEXT:    vmovaps 256(%rdi), %ymm13
-; AVX2-ONLY-NEXT:    vmovaps 448(%rdi), %ymm14
-; AVX2-ONLY-NEXT:    vmovaps 384(%rdi), %ymm15
-; AVX2-ONLY-NEXT:    vmovaps 320(%rdi), %xmm2
-; AVX2-ONLY-NEXT:    vmovaps 256(%rdi), %xmm4
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm4[0],xmm2[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 448(%rdi), %xmm5
-; AVX2-ONLY-NEXT:    vmovaps 384(%rdi), %xmm10
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm10[0],xmm5[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 192(%rdi), %xmm7
-; AVX2-ONLY-NEXT:    vmovaps 128(%rdi), %xmm8
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm8[0],xmm7[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps (%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm1 = xmm4[1],xmm2[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, (%rsp) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 64(%rdi), %xmm6
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm1 = xmm10[1],xmm5[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm0[0],xmm6[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm1 = xmm8[1],xmm7[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm6[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm15[0],ymm14[0],ymm15[2],ymm14[2]
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm6 = ymm13[0],ymm9[0],ymm13[2],ymm9[2]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm6[2,3],ymm0[2,3]
+; AVX2-ONLY-NEXT:    subq $136, %rsp
+; AVX2-ONLY-NEXT:    vmovaps 352(%rdi), %ymm0
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 192(%rdi), %ymm1
-; AVX2-ONLY-NEXT:    vmovaps 128(%rdi), %ymm0
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm8 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm12[0],ymm11[0],ymm12[2],ymm11[2]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],ymm8[2,3]
-; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm2 = ymm15[1],ymm14[1],ymm15[3],ymm14[3]
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm9 = ymm13[1],ymm9[1],ymm13[3],ymm9[3]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm9[2,3],ymm2[2,3]
-; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm1 = ymm12[1],ymm11[1],ymm12[3],ymm11[3]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
+; AVX2-ONLY-NEXT:    vmovaps 64(%rdi), %ymm8
+; AVX2-ONLY-NEXT:    vmovaps (%rdi), %ymm9
+; AVX2-ONLY-NEXT:    vmovaps 192(%rdi), %ymm10
+; AVX2-ONLY-NEXT:    vmovaps 128(%rdi), %ymm11
+; AVX2-ONLY-NEXT:    vmovaps 320(%rdi), %ymm7
+; AVX2-ONLY-NEXT:    vmovaps 256(%rdi), %ymm12
+; AVX2-ONLY-NEXT:    vmovaps 448(%rdi), %ymm13
+; AVX2-ONLY-NEXT:    vmovaps 384(%rdi), %ymm14
+; AVX2-ONLY-NEXT:    vmovaps 320(%rdi), %xmm1
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 448(%rdi), %ymm1, %ymm2
+; AVX2-ONLY-NEXT:    vmovaps 256(%rdi), %xmm1
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 384(%rdi), %ymm1, %ymm4
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm4[0],ymm2[0],ymm4[2],ymm2[2]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 480(%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps 416(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 352(%rdi), %xmm2
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 288(%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm12 = xmm0[0],xmm2[0]
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 224(%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps 160(%rdi), %xmm2
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm15 = xmm2[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm13 = xmm2[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps 32(%rdi), %xmm2
-; AVX2-ONLY-NEXT:    vmovaps 96(%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm8 = xmm2[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm10 = xmm2[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps 352(%rdi), %ymm6
-; AVX2-ONLY-NEXT:    vmovaps 288(%rdi), %ymm5
-; AVX2-ONLY-NEXT:    vmovaps 480(%rdi), %ymm4
-; AVX2-ONLY-NEXT:    vmovaps 416(%rdi), %ymm3
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm4[0],ymm3[2],ymm4[2]
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm9 = ymm5[0],ymm6[0],ymm5[2],ymm6[2]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm11 = ymm9[2,3],ymm0[2,3]
-; AVX2-ONLY-NEXT:    vmovaps 96(%rdi), %ymm9
-; AVX2-ONLY-NEXT:    vmovaps 32(%rdi), %ymm2
+; AVX2-ONLY-NEXT:    vmovaps (%rdi), %xmm3
+; AVX2-ONLY-NEXT:    vmovaps 64(%rdi), %xmm5
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 192(%rdi), %ymm5, %ymm5
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 128(%rdi), %ymm3, %ymm6
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm6[0],ymm5[0],ymm6[2],ymm5[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, (%rsp) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm4[1],ymm2[1],ymm4[3],ymm2[3]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm6[1],ymm5[1],ymm6[3],ymm5[3]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm5 = ymm14[0],ymm13[0],ymm14[2],ymm13[2]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm6 = ymm12[0],ymm7[0],ymm12[2],ymm7[2]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm6[2,3],ymm5[2,3]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm6 = ymm11[0],ymm10[0],ymm11[2],ymm10[2]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm15 = ymm9[0],ymm8[0],ymm9[2],ymm8[2]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm6[2,3]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 288(%rdi), %ymm15
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm13 = ymm14[1],ymm13[1],ymm14[3],ymm13[3]
+; AVX2-ONLY-NEXT:    vmovaps 480(%rdi), %ymm14
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm7 = ymm12[1],ymm7[1],ymm12[3],ymm7[3]
+; AVX2-ONLY-NEXT:    vmovaps 32(%rdi), %xmm12
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm7[2,3],ymm13[2,3]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm10 = ymm11[1],ymm10[1],ymm11[3],ymm10[3]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm8 = ymm9[1],ymm8[1],ymm9[3],ymm8[3]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm8 = ymm8[2,3],ymm10[2,3]
+; AVX2-ONLY-NEXT:    vmovaps 352(%rdi), %xmm9
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 480(%rdi), %ymm9, %ymm9
+; AVX2-ONLY-NEXT:    vmovaps 288(%rdi), %xmm10
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 416(%rdi), %ymm10, %ymm10
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm11 = ymm10[0],ymm9[0],ymm10[2],ymm9[2]
+; AVX2-ONLY-NEXT:    vmovaps 96(%rdi), %xmm13
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 224(%rdi), %ymm13, %ymm13
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 160(%rdi), %ymm12, %ymm12
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm9 = ymm10[1],ymm9[1],ymm10[3],ymm9[3]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm10 = ymm12[0],ymm13[0],ymm12[2],ymm13[2]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm12 = ymm12[1],ymm13[1],ymm12[3],ymm13[3]
+; AVX2-ONLY-NEXT:    vmovaps 416(%rdi), %ymm13
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm13[0],ymm14[0],ymm13[2],ymm14[2]
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm15[0],ymm2[0],ymm15[2],ymm2[2]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm7 = ymm1[2,3],ymm0[2,3]
+; AVX2-ONLY-NEXT:    vmovaps 96(%rdi), %ymm4
+; AVX2-ONLY-NEXT:    vmovaps 32(%rdi), %ymm3
 ; AVX2-ONLY-NEXT:    vmovaps 224(%rdi), %ymm1
 ; AVX2-ONLY-NEXT:    vmovaps 160(%rdi), %ymm0
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm7 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm14 = ymm2[0],ymm9[0],ymm2[2],ymm9[2]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm7 = ymm14[2,3],ymm7[2,3]
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm4[1],ymm3[3],ymm4[3]
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm4 = ymm5[1],ymm6[1],ymm5[3],ymm6[3]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm5 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm6 = ymm3[0],ymm4[0],ymm3[2],ymm4[2]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm5 = ymm6[2,3],ymm5[2,3]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm6 = ymm13[1],ymm14[1],ymm13[3],ymm14[3]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm13 = ymm15[1],ymm2[1],ymm15[3],ymm2[3]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm6 = ymm13[2,3],ymm6[2,3]
 ; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm9[1],ymm2[3],ymm9[3]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm1 = ymm3[1],ymm4[1],ymm3[3],ymm4[3]
 ; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, (%rsi)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 16(%rsi)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 48(%rsi)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 32(%rsi)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, (%rdx)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 16(%rdx)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 48(%rdx)
-; AVX2-ONLY-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 32(%rdx)
+; AVX2-ONLY-NEXT:    vmovups (%rsp), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, (%rsi)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, 32(%rsi)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, (%rdx)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, 32(%rdx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm1, (%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm1, 32(%rcx)
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm1, (%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm8, (%r8)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm1, 32(%r8)
-; AVX2-ONLY-NEXT:    vmovaps %xmm8, (%r9)
-; AVX2-ONLY-NEXT:    vmovaps %xmm15, 16(%r9)
-; AVX2-ONLY-NEXT:    vmovaps %xmm12, 32(%r9)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 48(%r9)
+; AVX2-ONLY-NEXT:    vmovaps %ymm10, (%r9)
+; AVX2-ONLY-NEXT:    vmovaps %ymm11, 32(%r9)
 ; AVX2-ONLY-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-ONLY-NEXT:    vmovaps %xmm10, (%rax)
-; AVX2-ONLY-NEXT:    vmovaps %xmm13, 16(%rax)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 32(%rax)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 48(%rax)
+; AVX2-ONLY-NEXT:    vmovaps %ymm12, (%rax)
+; AVX2-ONLY-NEXT:    vmovaps %ymm9, 32(%rax)
 ; AVX2-ONLY-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-ONLY-NEXT:    vmovaps %ymm7, (%rax)
-; AVX2-ONLY-NEXT:    vmovaps %ymm11, 32(%rax)
+; AVX2-ONLY-NEXT:    vmovaps %ymm5, (%rax)
+; AVX2-ONLY-NEXT:    vmovaps %ymm7, 32(%rax)
 ; AVX2-ONLY-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, (%rax)
-; AVX2-ONLY-NEXT:    vmovaps %ymm3, 32(%rax)
-; AVX2-ONLY-NEXT:    addq $184, %rsp
+; AVX2-ONLY-NEXT:    vmovaps %ymm6, 32(%rax)
+; AVX2-ONLY-NEXT:    addq $136, %rsp
 ; AVX2-ONLY-NEXT:    vzeroupper
 ; AVX2-ONLY-NEXT:    retq
 ;
@@ -904,71 +862,67 @@ define void @load_i64_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm14 = [4,12,4,12,4,12,4,12]
 ; AVX512F-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm7 = zmm3[0],zmm8[0],zmm3[2],zmm8[2],zmm3[4],zmm8[4],zmm3[6],zmm8[6]
-; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm16
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm17 = zmm3[1],zmm8[1],zmm3[3],zmm8[3],zmm3[5],zmm8[5],zmm3[7],zmm8[7]
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm16 = zmm3[1],zmm8[1],zmm3[3],zmm8[3],zmm3[5],zmm8[5],zmm3[7],zmm8[7]
 ; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm15 = [6,14,6,14,6,14,6,14]
 ; AVX512F-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm19
-; AVX512F-NEXT:    vpermt2q %zmm8, %zmm15, %zmm19
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm19 {%k1} = zmm9[0],zmm6[0],zmm9[2],zmm6[2],zmm9[4],zmm6[4],zmm9[6],zmm6[6]
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm20 = [7,15,7,15,7,15,7,15]
-; AVX512F-NEXT:    # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vpermt2q %zmm8, %zmm20, %zmm3
+; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm17
+; AVX512F-NEXT:    vpermt2q %zmm8, %zmm15, %zmm17
+; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm19 = [7,15,7,15,7,15,7,15]
+; AVX512F-NEXT:    # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vpermt2q %zmm8, %zmm19, %zmm3
+; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm8
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm17 {%k1} = zmm9[0],zmm6[0],zmm9[2],zmm6[2],zmm9[4],zmm6[4],zmm9[6],zmm6[6]
 ; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm9[1],zmm6[1],zmm9[3],zmm6[3],zmm9[5],zmm6[5],zmm9[7],zmm6[7]
 ; AVX512F-NEXT:    vpermt2q %zmm6, %zmm14, %zmm9
 ; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm7 {%k1}
 ; AVX512F-NEXT:    vpermi2q %zmm4, %zmm2, %zmm14
-; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm8 = [4,12,4,12]
-; AVX512F-NEXT:    # ymm8 = mem[0,1,0,1]
-; AVX512F-NEXT:    vpermi2q %zmm0, %zmm1, %zmm8
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3],ymm8[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm8, %zmm7, %zmm7
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm8 = [5,13,5,13,5,13,5,13]
-; AVX512F-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vpermt2q %zmm6, %zmm8, %zmm16
-; AVX512F-NEXT:    vpermi2q %zmm4, %zmm2, %zmm8
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm9 = [4,12,4,12]
+; AVX512F-NEXT:    # ymm9 = mem[0,1,0,1]
+; AVX512F-NEXT:    vpermi2q %zmm0, %zmm1, %zmm9
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm9 = ymm14[0,1,2,3],ymm9[4,5,6,7]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm9, %zmm7, %zmm7
+; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm9 = [5,13,5,13,5,13,5,13]
+; AVX512F-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vpermt2q %zmm6, %zmm9, %zmm8
+; AVX512F-NEXT:    vpermi2q %zmm4, %zmm2, %zmm9
 ; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm6 = [5,13,5,13]
 ; AVX512F-NEXT:    # ymm6 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpermi2q %zmm0, %zmm1, %zmm6
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7]
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7]
 ; AVX512F-NEXT:    vmovdqa64 %zmm18, %zmm5 {%k1}
-; AVX512F-NEXT:    vmovdqa 192(%rdi), %xmm8
-; AVX512F-NEXT:    vmovdqa 128(%rdi), %xmm9
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm14 = xmm9[0],xmm8[0]
-; AVX512F-NEXT:    vmovdqa64 (%rdi), %xmm18
-; AVX512F-NEXT:    vmovdqa64 64(%rdi), %xmm21
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm22 = xmm18[0],xmm21[0]
-; AVX512F-NEXT:    vinserti32x4 $1, %xmm14, %ymm22, %ymm14
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm14, %zmm5, %zmm5
+; AVX512F-NEXT:    vmovdqa (%rdi), %xmm9
+; AVX512F-NEXT:    vmovdqa 64(%rdi), %xmm14
+; AVX512F-NEXT:    vinserti128 $1, 192(%rdi), %ymm14, %ymm14
+; AVX512F-NEXT:    vinserti128 $1, 128(%rdi), %ymm9, %ymm9
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm18 = ymm9[0],ymm14[0],ymm9[2],ymm14[2]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm18, %zmm5, %zmm5
 ; AVX512F-NEXT:    vmovdqa64 %zmm11, %zmm10 {%k1}
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} xmm8 = xmm9[1],xmm8[1]
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} xmm9 = xmm18[1],xmm21[1]
-; AVX512F-NEXT:    vinserti128 $1, %xmm8, %ymm9, %ymm8
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm9 = ymm9[1],ymm14[1],ymm9[3],ymm14[3]
 ; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %rdi
 ; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm8, %zmm10, %zmm8
-; AVX512F-NEXT:    vmovdqa64 %zmm16, %zmm17 {%k1}
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm6, %zmm17, %zmm6
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm9, %zmm10, %zmm9
+; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm16 {%k1}
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm6, %zmm16, %zmm6
 ; AVX512F-NEXT:    vpermi2q %zmm4, %zmm2, %zmm15
-; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm9 = [6,14,6,14]
-; AVX512F-NEXT:    # ymm9 = mem[0,1,0,1]
-; AVX512F-NEXT:    vpermi2q %zmm0, %zmm1, %zmm9
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm9 = ymm15[0,1,2,3],ymm9[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm9, %zmm19, %zmm9
-; AVX512F-NEXT:    vpermt2q %zmm4, %zmm20, %zmm2
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm8 = [6,14,6,14]
+; AVX512F-NEXT:    # ymm8 = mem[0,1,0,1]
+; AVX512F-NEXT:    vpermi2q %zmm0, %zmm1, %zmm8
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm8 = ymm15[0,1,2,3],ymm8[4,5,6,7]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm8, %zmm17, %zmm8
+; AVX512F-NEXT:    vpermt2q %zmm4, %zmm19, %zmm2
 ; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [7,15,7,15]
 ; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpermi2q %zmm0, %zmm1, %zmm4
 ; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm4[4,5,6,7]
 ; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm3, %zmm0
 ; AVX512F-NEXT:    vmovdqa64 %zmm5, (%rsi)
-; AVX512F-NEXT:    vmovdqa64 %zmm8, (%rdx)
+; AVX512F-NEXT:    vmovdqa64 %zmm9, (%rdx)
 ; AVX512F-NEXT:    vmovdqa64 %zmm12, (%rcx)
 ; AVX512F-NEXT:    vmovdqa64 %zmm13, (%r8)
 ; AVX512F-NEXT:    vmovdqa64 %zmm7, (%r9)
 ; AVX512F-NEXT:    vmovdqa64 %zmm6, (%r10)
-; AVX512F-NEXT:    vmovdqa64 %zmm9, (%rdi)
+; AVX512F-NEXT:    vmovdqa64 %zmm8, (%rdi)
 ; AVX512F-NEXT:    vmovdqa64 %zmm0, (%rax)
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
@@ -1022,71 +976,67 @@ define void @load_i64_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm14 = [4,12,4,12,4,12,4,12]
 ; AVX512BW-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm7 = zmm3[0],zmm8[0],zmm3[2],zmm8[2],zmm3[4],zmm8[4],zmm3[6],zmm8[6]
-; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm16
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm17 = zmm3[1],zmm8[1],zmm3[3],zmm8[3],zmm3[5],zmm8[5],zmm3[7],zmm8[7]
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm16 = zmm3[1],zmm8[1],zmm3[3],zmm8[3],zmm3[5],zmm8[5],zmm3[7],zmm8[7]
 ; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm15 = [6,14,6,14,6,14,6,14]
 ; AVX512BW-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm19
-; AVX512BW-NEXT:    vpermt2q %zmm8, %zmm15, %zmm19
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm19 {%k1} = zmm9[0],zmm6[0],zmm9[2],zmm6[2],zmm9[4],zmm6[4],zmm9[6],zmm6[6]
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm20 = [7,15,7,15,7,15,7,15]
-; AVX512BW-NEXT:    # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vpermt2q %zmm8, %zmm20, %zmm3
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm17
+; AVX512BW-NEXT:    vpermt2q %zmm8, %zmm15, %zmm17
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm19 = [7,15,7,15,7,15,7,15]
+; AVX512BW-NEXT:    # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vpermt2q %zmm8, %zmm19, %zmm3
+; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm8
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm17 {%k1} = zmm9[0],zmm6[0],zmm9[2],zmm6[2],zmm9[4],zmm6[4],zmm9[6],zmm6[6]
 ; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm9[1],zmm6[1],zmm9[3],zmm6[3],zmm9[5],zmm6[5],zmm9[7],zmm6[7]
 ; AVX512BW-NEXT:    vpermt2q %zmm6, %zmm14, %zmm9
 ; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm7 {%k1}
 ; AVX512BW-NEXT:    vpermi2q %zmm4, %zmm2, %zmm14
-; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm8 = [4,12,4,12]
-; AVX512BW-NEXT:    # ymm8 = mem[0,1,0,1]
-; AVX512BW-NEXT:    vpermi2q %zmm0, %zmm1, %zmm8
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3],ymm8[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm8, %zmm7, %zmm7
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm8 = [5,13,5,13,5,13,5,13]
-; AVX512BW-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vpermt2q %zmm6, %zmm8, %zmm16
-; AVX512BW-NEXT:    vpermi2q %zmm4, %zmm2, %zmm8
+; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm9 = [4,12,4,12]
+; AVX512BW-NEXT:    # ymm9 = mem[0,1,0,1]
+; AVX512BW-NEXT:    vpermi2q %zmm0, %zmm1, %zmm9
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm9 = ymm14[0,1,2,3],ymm9[4,5,6,7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm9, %zmm7, %zmm7
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm9 = [5,13,5,13,5,13,5,13]
+; AVX512BW-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vpermt2q %zmm6, %zmm9, %zmm8
+; AVX512BW-NEXT:    vpermi2q %zmm4, %zmm2, %zmm9
 ; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm6 = [5,13,5,13]
 ; AVX512BW-NEXT:    # ymm6 = mem[0,1,0,1]
 ; AVX512BW-NEXT:    vpermi2q %zmm0, %zmm1, %zmm6
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7]
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7]
 ; AVX512BW-NEXT:    vmovdqa64 %zmm18, %zmm5 {%k1}
-; AVX512BW-NEXT:    vmovdqa 192(%rdi), %xmm8
-; AVX512BW-NEXT:    vmovdqa 128(%rdi), %xmm9
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm14 = xmm9[0],xmm8[0]
-; AVX512BW-NEXT:    vmovdqa64 (%rdi), %xmm18
-; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %xmm21
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm22 = xmm18[0],xmm21[0]
-; AVX512BW-NEXT:    vinserti32x4 $1, %xmm14, %ymm22, %ymm14
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm14, %zmm5, %zmm5
+; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm9
+; AVX512BW-NEXT:    vmovdqa 64(%rdi), %xmm14
+; AVX512BW-NEXT:    vinserti128 $1, 192(%rdi), %ymm14, %ymm14
+; AVX512BW-NEXT:    vinserti128 $1, 128(%rdi), %ymm9, %ymm9
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm18 = ymm9[0],ymm14[0],ymm9[2],ymm14[2]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm18, %zmm5, %zmm5
 ; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm10 {%k1}
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} xmm8 = xmm9[1],xmm8[1]
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} xmm9 = xmm18[1],xmm21[1]
-; AVX512BW-NEXT:    vinserti128 $1, %xmm8, %ymm9, %ymm8
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm9 = ymm9[1],ymm14[1],ymm9[3],ymm14[3]
 ; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rdi
 ; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm8, %zmm10, %zmm8
-; AVX512BW-NEXT:    vmovdqa64 %zmm16, %zmm17 {%k1}
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm6, %zmm17, %zmm6
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm9, %zmm10, %zmm9
+; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm16 {%k1}
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm6, %zmm16, %zmm6
 ; AVX512BW-NEXT:    vpermi2q %zmm4, %zmm2, %zmm15
-; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm9 = [6,14,6,14]
-; AVX512BW-NEXT:    # ymm9 = mem[0,1,0,1]
-; AVX512BW-NEXT:    vpermi2q %zmm0, %zmm1, %zmm9
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm9 = ymm15[0,1,2,3],ymm9[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm9, %zmm19, %zmm9
-; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm20, %zmm2
+; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm8 = [6,14,6,14]
+; AVX512BW-NEXT:    # ymm8 = mem[0,1,0,1]
+; AVX512BW-NEXT:    vpermi2q %zmm0, %zmm1, %zmm8
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm8 = ymm15[0,1,2,3],ymm8[4,5,6,7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm8, %zmm17, %zmm8
+; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm19, %zmm2
 ; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [7,15,7,15]
 ; AVX512BW-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512BW-NEXT:    vpermi2q %zmm0, %zmm1, %zmm4
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm4[4,5,6,7]
 ; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm3, %zmm0
 ; AVX512BW-NEXT:    vmovdqa64 %zmm5, (%rsi)
-; AVX512BW-NEXT:    vmovdqa64 %zmm8, (%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm9, (%rdx)
 ; AVX512BW-NEXT:    vmovdqa64 %zmm12, (%rcx)
 ; AVX512BW-NEXT:    vmovdqa64 %zmm13, (%r8)
 ; AVX512BW-NEXT:    vmovdqa64 %zmm7, (%r9)
 ; AVX512BW-NEXT:    vmovdqa64 %zmm6, (%r10)
-; AVX512BW-NEXT:    vmovdqa64 %zmm9, (%rdi)
+; AVX512BW-NEXT:    vmovdqa64 %zmm8, (%rdi)
 ; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rax)
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
@@ -1741,233 +1691,188 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-ONLY-LABEL: load_i64_stride8_vf16:
 ; AVX2-ONLY:       # %bb.0:
 ; AVX2-ONLY-NEXT:    subq $808, %rsp # imm = 0x328
-; AVX2-ONLY-NEXT:    vmovaps 448(%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps 384(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 320(%rdi), %xmm2
-; AVX2-ONLY-NEXT:    vmovaps 256(%rdi), %xmm3
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm4 = xmm3[0],xmm2[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 832(%rdi), %xmm4
-; AVX2-ONLY-NEXT:    vmovaps 768(%rdi), %xmm5
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm6 = xmm5[0],xmm4[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 960(%rdi), %xmm6
-; AVX2-ONLY-NEXT:    vmovaps 896(%rdi), %xmm7
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm8 = xmm7[0],xmm6[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 704(%rdi), %xmm8
-; AVX2-ONLY-NEXT:    vmovaps 640(%rdi), %xmm9
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm10 = xmm9[0],xmm8[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 576(%rdi), %xmm10
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 512(%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm1 = xmm3[1],xmm2[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm0[0],xmm10[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm1 = xmm7[1],xmm6[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 192(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm2 = xmm5[1],xmm4[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 128(%rdi), %xmm2
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm3 = xmm9[1],xmm8[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm3 = xmm2[0],xmm1[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm10[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps (%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 64(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm0[0],xmm1[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 320(%rdi), %ymm0
+; AVX2-ONLY-NEXT:    vmovaps 832(%rdi), %ymm1
+; AVX2-ONLY-NEXT:    vmovaps 768(%rdi), %ymm10
+; AVX2-ONLY-NEXT:    vmovaps 320(%rdi), %ymm14
+; AVX2-ONLY-NEXT:    vmovaps 256(%rdi), %ymm11
+; AVX2-ONLY-NEXT:    vmovaps 448(%rdi), %ymm12
+; AVX2-ONLY-NEXT:    vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 384(%rdi), %ymm13
+; AVX2-ONLY-NEXT:    vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 320(%rdi), %xmm3
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 448(%rdi), %ymm3, %ymm3
+; AVX2-ONLY-NEXT:    vmovaps 256(%rdi), %xmm4
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 384(%rdi), %ymm4, %ymm4
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm4[0],ymm3[0],ymm4[2],ymm3[2]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 256(%rdi), %ymm1
-; AVX2-ONLY-NEXT:    vmovaps 448(%rdi), %ymm2
-; AVX2-ONLY-NEXT:    vmovaps 384(%rdi), %ymm3
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm4 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm5 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm5[2,3],ymm4[2,3]
+; AVX2-ONLY-NEXT:    vmovaps 832(%rdi), %xmm5
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 960(%rdi), %ymm5, %ymm5
+; AVX2-ONLY-NEXT:    vmovaps 768(%rdi), %xmm6
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 896(%rdi), %ymm6, %ymm6
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm6[0],ymm5[0],ymm6[2],ymm5[2]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 832(%rdi), %ymm4
-; AVX2-ONLY-NEXT:    vmovaps 768(%rdi), %ymm5
-; AVX2-ONLY-NEXT:    vmovaps 960(%rdi), %ymm6
-; AVX2-ONLY-NEXT:    vmovaps 896(%rdi), %ymm7
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm8 = ymm7[0],ymm6[0],ymm7[2],ymm6[2]
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm9 = ymm5[0],ymm4[0],ymm5[2],ymm4[2]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm9[2,3],ymm8[2,3]
+; AVX2-ONLY-NEXT:    vmovaps (%rdi), %xmm7
+; AVX2-ONLY-NEXT:    vmovaps 64(%rdi), %xmm8
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 192(%rdi), %ymm8, %ymm8
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 128(%rdi), %ymm7, %ymm7
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm7[0],ymm8[0],ymm7[2],ymm8[2]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 64(%rdi), %ymm8
-; AVX2-ONLY-NEXT:    vmovaps (%rdi), %ymm9
-; AVX2-ONLY-NEXT:    vmovaps 192(%rdi), %ymm10
-; AVX2-ONLY-NEXT:    vmovaps 128(%rdi), %ymm11
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm12 = ymm11[0],ymm10[0],ymm11[2],ymm10[2]
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm13 = ymm9[0],ymm8[0],ymm9[2],ymm8[2]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm12[2,3]
+; AVX2-ONLY-NEXT:    vmovaps 576(%rdi), %xmm9
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 704(%rdi), %ymm9, %ymm9
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm4[1],ymm3[1],ymm4[3],ymm3[3]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 576(%rdi), %ymm12
-; AVX2-ONLY-NEXT:    vmovaps 512(%rdi), %ymm13
-; AVX2-ONLY-NEXT:    vmovaps 704(%rdi), %ymm14
-; AVX2-ONLY-NEXT:    vmovaps 640(%rdi), %ymm15
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm15[0],ymm14[0],ymm15[2],ymm14[2]
+; AVX2-ONLY-NEXT:    vmovaps 512(%rdi), %xmm3
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 640(%rdi), %ymm3, %ymm3
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm6[1],ymm5[1],ymm6[3],ymm5[3]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm13[0],ymm12[0],ymm13[2],ymm12[2]
-; AVX2-ONLY-NEXT:    vperm2f128 $49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT:    # ymm0 = ymm0[2,3],mem[2,3]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm9[0],ymm3[2],ymm9[2]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm3[1],ymm2[1],ymm3[3],ymm2[3]
-; AVX2-ONLY-NEXT:    vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT:    # ymm1 = ymm1[1],mem[1],ymm1[3],mem[3]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm7[1],ymm8[1],ymm7[3],ymm8[3]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm7[1],ymm6[1],ymm7[3],ymm6[3]
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm1 = ymm5[1],ymm4[1],ymm5[3],ymm4[3]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm3[1],ymm9[1],ymm3[3],ymm9[3]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm11[1],ymm10[1],ymm11[3],ymm10[3]
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm1 = ymm9[1],ymm8[1],ymm9[3],ymm8[3]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm3 = ymm13[0],ymm12[0],ymm13[2],ymm12[2]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm4 = ymm11[0],ymm14[0],ymm11[2],ymm14[2]
+; AVX2-ONLY-NEXT:    vmovaps %ymm11, %ymm2
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm4[2,3],ymm3[2,3]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm15[1],ymm14[1],ymm15[3],ymm14[3]
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm1 = ymm13[1],ymm12[1],ymm13[3],ymm12[3]
+; AVX2-ONLY-NEXT:    vmovaps 960(%rdi), %ymm3
+; AVX2-ONLY-NEXT:    vmovaps 896(%rdi), %ymm4
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm5 = ymm4[0],ymm3[0],ymm4[2],ymm3[2]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm6 = ymm10[0],ymm1[0],ymm10[2],ymm1[2]
+; AVX2-ONLY-NEXT:    vmovaps %ymm10, %ymm12
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, %ymm11
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm6[2,3],ymm5[2,3]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 64(%rdi), %ymm5
+; AVX2-ONLY-NEXT:    vmovaps (%rdi), %ymm6
+; AVX2-ONLY-NEXT:    vmovaps 192(%rdi), %ymm7
+; AVX2-ONLY-NEXT:    vmovaps 128(%rdi), %ymm8
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm9 = ymm8[0],ymm7[0],ymm8[2],ymm7[2]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm10 = ymm6[0],ymm5[0],ymm6[2],ymm5[2]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm10[2,3],ymm9[2,3]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 576(%rdi), %ymm9
+; AVX2-ONLY-NEXT:    vmovaps 512(%rdi), %ymm10
+; AVX2-ONLY-NEXT:    vmovaps 704(%rdi), %ymm15
+; AVX2-ONLY-NEXT:    vmovaps 640(%rdi), %ymm0
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm0[0],ymm15[0],ymm0[2],ymm15[2]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm13 = ymm10[0],ymm9[0],ymm10[2],ymm9[2]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm13[2,3],ymm1[2,3]
+; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm1 = ymm1[1],mem[1],ymm1[3],mem[3]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm14[1],ymm2[3],ymm14[3]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
+; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm1 = ymm4[1],ymm3[1],ymm4[3],ymm3[3]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm2 = ymm12[1],ymm11[1],ymm12[3],ymm11[3]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
+; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm1 = ymm8[1],ymm7[1],ymm8[3],ymm7[3]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm2 = ymm6[1],ymm5[1],ymm6[3],ymm5[3]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
+; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm15[1],ymm0[3],ymm15[3]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm1 = ymm10[1],ymm9[1],ymm10[3],ymm9[3]
 ; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 480(%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps 416(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX2-ONLY-NEXT:    vmovaps 352(%rdi), %xmm0
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 480(%rdi), %ymm0, %ymm0
 ; AVX2-ONLY-NEXT:    vmovaps 288(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 992(%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps 928(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, (%rsp) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 416(%rdi), %ymm1, %ymm1
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovaps 864(%rdi), %xmm0
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 992(%rdi), %ymm0, %ymm0
 ; AVX2-ONLY-NEXT:    vmovaps 800(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 224(%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps 160(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 32(%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps 96(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm0[0],xmm1[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 736(%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps 672(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 928(%rdi), %ymm1, %ymm1
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 96(%rdi), %xmm0
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 224(%rdi), %ymm0, %ymm0
+; AVX2-ONLY-NEXT:    vmovaps 32(%rdi), %xmm1
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 160(%rdi), %ymm1, %ymm1
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovaps 608(%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps 544(%rdi), %xmm2
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm2[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 736(%rdi), %ymm0, %ymm0
+; AVX2-ONLY-NEXT:    vmovaps 544(%rdi), %xmm1
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 672(%rdi), %ymm1, %ymm1
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovaps 352(%rdi), %ymm0
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 288(%rdi), %ymm15
-; AVX2-ONLY-NEXT:    vmovaps 480(%rdi), %ymm10
-; AVX2-ONLY-NEXT:    vmovaps 416(%rdi), %ymm8
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm8[0],ymm10[0],ymm8[2],ymm10[2]
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm7 = ymm15[0],ymm0[0],ymm15[2],ymm0[2]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm7[2,3],ymm2[2,3]
+; AVX2-ONLY-NEXT:    vmovaps 288(%rdi), %ymm14
+; AVX2-ONLY-NEXT:    vmovaps 480(%rdi), %ymm11
+; AVX2-ONLY-NEXT:    vmovaps 416(%rdi), %ymm9
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm3 = ymm9[0],ymm11[0],ymm9[2],ymm11[2]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm4 = ymm14[0],ymm0[0],ymm14[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm4[2,3],ymm3[2,3]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 864(%rdi), %ymm1
-; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 800(%rdi), %ymm11
-; AVX2-ONLY-NEXT:    vmovaps 992(%rdi), %ymm9
-; AVX2-ONLY-NEXT:    vmovaps 928(%rdi), %ymm7
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm7[0],ymm9[0],ymm7[2],ymm9[2]
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm12 = ymm11[0],ymm1[0],ymm11[2],ymm1[2]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm12[2,3],ymm0[2,3]
+; AVX2-ONLY-NEXT:    vmovaps 864(%rdi), %ymm0
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 800(%rdi), %ymm12
+; AVX2-ONLY-NEXT:    vmovaps 992(%rdi), %ymm10
+; AVX2-ONLY-NEXT:    vmovaps 928(%rdi), %ymm6
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm7 = ymm6[0],ymm10[0],ymm6[2],ymm10[2]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm8 = ymm12[0],ymm0[0],ymm12[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm8[2,3],ymm7[2,3]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 96(%rdi), %ymm13
-; AVX2-ONLY-NEXT:    vmovaps 32(%rdi), %ymm6
+; AVX2-ONLY-NEXT:    vmovaps 96(%rdi), %ymm8
+; AVX2-ONLY-NEXT:    vmovaps 32(%rdi), %ymm5
 ; AVX2-ONLY-NEXT:    vmovaps 224(%rdi), %ymm3
 ; AVX2-ONLY-NEXT:    vmovaps 160(%rdi), %ymm2
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm12 = ymm6[0],ymm13[0],ymm6[2],ymm13[2]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm12[2,3],ymm0[2,3]
-; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 608(%rdi), %ymm5
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm7 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm15 = ymm5[0],ymm8[0],ymm5[2],ymm8[2]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm7[2,3]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, (%rsp) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 608(%rdi), %ymm15
 ; AVX2-ONLY-NEXT:    vmovaps 544(%rdi), %ymm4
 ; AVX2-ONLY-NEXT:    vmovaps 736(%rdi), %ymm1
 ; AVX2-ONLY-NEXT:    vmovaps 672(%rdi), %ymm0
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm12 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm14 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm12 = ymm14[2,3],ymm12[2,3]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm7 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm13 = ymm4[0],ymm15[0],ymm4[2],ymm15[2]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm7 = ymm13[2,3],ymm7[2,3]
 ; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm3 = ymm6[1],ymm13[1],ymm6[3],ymm13[3]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm13 = ymm3[2,3],ymm2[2,3]
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm2 = ymm8[1],ymm10[1],ymm8[3],ymm10[3]
-; AVX2-ONLY-NEXT:    vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm3 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT:    # ymm3 = ymm15[1],mem[1],ymm15[3],mem[3]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm3 = ymm5[1],ymm8[1],ymm5[3],ymm8[3]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm8 = ymm3[2,3],ymm2[2,3]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm2 = ymm9[1],ymm11[1],ymm9[3],ymm11[3]
+; AVX2-ONLY-NEXT:    vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm3 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm3 = ymm14[1],mem[1],ymm14[3],mem[3]
 ; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm3[2,3],ymm2[2,3]
 ; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm1 = ymm4[1],ymm5[1],ymm4[3],ymm5[3]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm0[2,3]
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm7[1],ymm9[1],ymm7[3],ymm9[3]
-; AVX2-ONLY-NEXT:    vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm3 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT:    # ymm3 = ymm11[1],mem[1],ymm11[3],mem[3]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm3, 16(%rsi)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm3, (%rsi)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm3, 64(%rsi)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm3, 80(%rsi)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm3, 112(%rsi)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm3, 96(%rsi)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm3, 32(%rsi)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm3, 48(%rsi)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm3, (%rdx)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm3, 16(%rdx)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm3, 64(%rdx)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm3, 80(%rdx)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm3, 96(%rdx)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm3, 112(%rdx)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm3, 32(%rdx)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm3, 48(%rdx)
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm1 = ymm4[1],ymm15[1],ymm4[3],ymm15[3]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm1 = ymm6[1],ymm10[1],ymm6[3],ymm10[3]
+; AVX2-ONLY-NEXT:    vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm3 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm3 = ymm12[1],mem[1],ymm12[3],mem[3]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3]
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm3, 64(%rsi)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm3, (%rsi)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm3, 96(%rsi)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm3, 32(%rsi)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm3, 64(%rdx)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm3, (%rdx)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm3, 96(%rdx)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm3, 32(%rdx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm3, 64(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
@@ -1984,41 +1889,17 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-ONLY-NEXT:    vmovaps %ymm3, 96(%r8)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm3, 32(%r8)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm3, 80(%r9)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm3, 64(%r9)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm3, (%r9)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm3, 16(%r9)
-; AVX2-ONLY-NEXT:    vmovaps (%rsp), %xmm3 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm3, 112(%r9)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm3, 96(%r9)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm3, 32(%r9)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm3, 48(%r9)
-; AVX2-ONLY-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm3, 64(%rax)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm3, 80(%rax)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm3, (%rax)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm3, 16(%rax)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm3, 96(%rax)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm3, 112(%rax)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm3, 32(%rax)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm3, 48(%rax)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm3, 64(%r9)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm3, (%r9)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm3, 96(%r9)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm3, 32(%r9)
 ; AVX2-ONLY-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-ONLY-NEXT:    vmovaps %ymm12, 64(%rax)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm3, 64(%rax)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm3, (%rax)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
@@ -2026,471 +1907,457 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm3, 32(%rax)
 ; AVX2-ONLY-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 96(%rax)
-; AVX2-ONLY-NEXT:    vmovaps %ymm1, 64(%rax)
+; AVX2-ONLY-NEXT:    vmovaps %ymm7, 64(%rax)
+; AVX2-ONLY-NEXT:    vmovups (%rsp), %ymm3 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm3, (%rax)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm3, 96(%rax)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm3, 32(%rax)
+; AVX2-ONLY-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, 96(%rax)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 64(%rax)
 ; AVX2-ONLY-NEXT:    vmovaps %ymm2, 32(%rax)
-; AVX2-ONLY-NEXT:    vmovaps %ymm13, (%rax)
+; AVX2-ONLY-NEXT:    vmovaps %ymm8, (%rax)
 ; AVX2-ONLY-NEXT:    addq $808, %rsp # imm = 0x328
 ; AVX2-ONLY-NEXT:    vzeroupper
 ; AVX2-ONLY-NEXT:    retq
 ;
 ; AVX512F-LABEL: load_i64_stride8_vf16:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    subq $280, %rsp # imm = 0x118
-; AVX512F-NEXT:    vmovdqa64 704(%rdi), %zmm19
-; AVX512F-NEXT:    vmovdqa64 640(%rdi), %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 576(%rdi), %zmm30
+; AVX512F-NEXT:    subq $200, %rsp
+; AVX512F-NEXT:    vmovdqa64 704(%rdi), %zmm28
+; AVX512F-NEXT:    vmovaps 640(%rdi), %zmm0
+; AVX512F-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 576(%rdi), %zmm21
 ; AVX512F-NEXT:    vmovaps 512(%rdi), %zmm0
 ; AVX512F-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 832(%rdi), %zmm16
-; AVX512F-NEXT:    vmovdqa64 768(%rdi), %zmm8
-; AVX512F-NEXT:    vmovdqa64 960(%rdi), %zmm17
-; AVX512F-NEXT:    vmovdqa64 896(%rdi), %zmm9
-; AVX512F-NEXT:    vmovdqa64 320(%rdi), %zmm11
-; AVX512F-NEXT:    vmovdqa64 256(%rdi), %zmm7
-; AVX512F-NEXT:    vmovdqa64 448(%rdi), %zmm10
-; AVX512F-NEXT:    vmovdqa64 384(%rdi), %zmm12
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm18 = [0,8,0,8,0,8,0,8]
-; AVX512F-NEXT:    # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm12, %zmm0
-; AVX512F-NEXT:    vmovdqa64 %zmm12, %zmm20
-; AVX512F-NEXT:    vmovdqa64 %zmm12, %zmm5
-; AVX512F-NEXT:    vmovdqa64 %zmm12, %zmm3
-; AVX512F-NEXT:    vpermt2q %zmm10, %zmm18, %zmm3
-; AVX512F-NEXT:    vmovdqa64 %zmm7, %zmm13
-; AVX512F-NEXT:    vpermt2q %zmm11, %zmm18, %zmm13
+; AVX512F-NEXT:    vmovdqa64 192(%rdi), %zmm30
+; AVX512F-NEXT:    vmovdqa64 128(%rdi), %zmm3
+; AVX512F-NEXT:    vmovdqa64 832(%rdi), %zmm10
+; AVX512F-NEXT:    vmovdqa64 768(%rdi), %zmm29
+; AVX512F-NEXT:    vmovdqa64 960(%rdi), %zmm9
+; AVX512F-NEXT:    vmovdqa64 896(%rdi), %zmm11
+; AVX512F-NEXT:    vmovdqa64 320(%rdi), %zmm13
+; AVX512F-NEXT:    vmovdqa64 256(%rdi), %zmm5
+; AVX512F-NEXT:    vmovdqa64 448(%rdi), %zmm12
+; AVX512F-NEXT:    vmovdqa64 384(%rdi), %zmm14
+; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm31 = [0,8,0,8,0,8,0,8]
+; AVX512F-NEXT:    # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vmovdqa64 %zmm14, %zmm0
+; AVX512F-NEXT:    vmovdqa64 %zmm14, %zmm4
+; AVX512F-NEXT:    vmovdqa64 %zmm14, %zmm6
+; AVX512F-NEXT:    vmovdqa64 %zmm14, %zmm7
+; AVX512F-NEXT:    vpermt2q %zmm12, %zmm31, %zmm7
+; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm15
+; AVX512F-NEXT:    vpermt2q %zmm13, %zmm31, %zmm15
 ; AVX512F-NEXT:    movb $-64, %al
 ; AVX512F-NEXT:    kmovw %eax, %k1
-; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm13 {%k1}
-; AVX512F-NEXT:    vmovdqa 192(%rdi), %xmm1
-; AVX512F-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512F-NEXT:    vmovdqa64 128(%rdi), %xmm31
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm31[0],xmm1[0]
-; AVX512F-NEXT:    vmovdqa64 (%rdi), %xmm23
-; AVX512F-NEXT:    vmovdqa64 64(%rdi), %xmm21
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm14 = xmm23[0],xmm21[0]
-; AVX512F-NEXT:    vinserti128 $1, %xmm3, %ymm14, %ymm3
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm3, %zmm13, %zmm1
+; AVX512F-NEXT:    vmovdqa64 %zmm7, %zmm15 {%k1}
+; AVX512F-NEXT:    vmovdqa (%rdi), %xmm7
+; AVX512F-NEXT:    vmovdqa64 64(%rdi), %xmm16
+; AVX512F-NEXT:    vinserti32x4 $1, 192(%rdi), %ymm16, %ymm19
+; AVX512F-NEXT:    vinserti32x4 $1, 128(%rdi), %ymm7, %ymm18
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm7 = ymm18[0],ymm19[0],ymm18[2],ymm19[2]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm7, %zmm15, %zmm1
 ; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm3
-; AVX512F-NEXT:    vpermt2q %zmm17, %zmm18, %zmm3
-; AVX512F-NEXT:    vpermi2q %zmm16, %zmm8, %zmm18
-; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm18 {%k1}
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [2,10,2,10,2,10,2,10]
-; AVX512F-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vpermt2q %zmm10, %zmm3, %zmm0
-; AVX512F-NEXT:    vmovdqa64 %zmm7, %zmm13
-; AVX512F-NEXT:    vpermt2q %zmm11, %zmm3, %zmm13
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm13 {%k1}
+; AVX512F-NEXT:    vmovdqa64 %zmm11, %zmm7
+; AVX512F-NEXT:    vpermt2q %zmm9, %zmm31, %zmm7
+; AVX512F-NEXT:    vpermi2q %zmm10, %zmm29, %zmm31
+; AVX512F-NEXT:    vmovdqa64 %zmm7, %zmm31 {%k1}
+; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm7 = [2,10,2,10,2,10,2,10]
+; AVX512F-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vpermt2q %zmm12, %zmm7, %zmm0
+; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm15
+; AVX512F-NEXT:    vpermt2q %zmm13, %zmm7, %zmm15
+; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm15 {%k1}
 ; AVX512F-NEXT:    vmovdqa 192(%rdi), %ymm0
-; AVX512F-NEXT:    vmovdqa 128(%rdi), %ymm14
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm14[0],ymm0[0],ymm14[2],ymm0[2]
+; AVX512F-NEXT:    vmovdqa64 128(%rdi), %ymm20
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm20[0],ymm0[0],ymm20[2],ymm0[2]
 ; AVX512F-NEXT:    vmovdqa64 64(%rdi), %ymm22
-; AVX512F-NEXT:    vmovdqa64 (%rdi), %ymm24
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm15 = ymm24[0],ymm22[0],ymm24[2],ymm22[2]
-; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm15[2,3],ymm1[2,3]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm13, %zmm1
+; AVX512F-NEXT:    vmovdqa64 (%rdi), %ymm23
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm2 = ymm23[0],ymm22[0],ymm23[2],ymm22[2]
+; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm15, %zmm1
 ; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm1
-; AVX512F-NEXT:    vpermt2q %zmm17, %zmm3, %zmm1
-; AVX512F-NEXT:    vpermi2q %zmm16, %zmm8, %zmm3
-; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm3 {%k1}
+; AVX512F-NEXT:    vmovdqa64 %zmm11, %zmm1
+; AVX512F-NEXT:    vpermt2q %zmm9, %zmm7, %zmm1
+; AVX512F-NEXT:    vpermi2q %zmm10, %zmm29, %zmm7
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm7 {%k1}
 ; AVX512F-NEXT:    vmovdqa 704(%rdi), %ymm1
-; AVX512F-NEXT:    vmovdqa 640(%rdi), %ymm15
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm13 = ymm15[0],ymm1[0],ymm15[2],ymm1[2]
-; AVX512F-NEXT:    vmovdqa64 576(%rdi), %ymm26
-; AVX512F-NEXT:    vmovdqa64 512(%rdi), %ymm28
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm6 = ymm28[0],ymm26[0],ymm28[2],ymm26[2]
-; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm6 = ymm6[2,3],ymm13[2,3]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm6, %zmm3, %zmm3
-; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm29 = [3,11,3,11,3,11,3,11]
-; AVX512F-NEXT:    # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vpermt2q %zmm10, %zmm29, %zmm20
-; AVX512F-NEXT:    vmovdqa64 %zmm7, %zmm4
-; AVX512F-NEXT:    vpermt2q %zmm11, %zmm29, %zmm4
-; AVX512F-NEXT:    vmovdqa64 %zmm20, %zmm4 {%k1}
-; AVX512F-NEXT:    vmovdqa64 192(%rdi), %zmm25
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm14[1],ymm0[1],ymm14[3],ymm0[3]
-; AVX512F-NEXT:    vmovdqa64 128(%rdi), %zmm13
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm24[1],ymm22[1],ymm24[3],ymm22[3]
-; AVX512F-NEXT:    vmovdqa64 64(%rdi), %zmm27
-; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
-; AVX512F-NEXT:    vmovdqa64 (%rdi), %zmm3
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm17, %zmm29, %zmm0
-; AVX512F-NEXT:    vpermi2q %zmm16, %zmm8, %zmm29
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm29 {%k1}
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm15[1],ymm1[1],ymm15[3],ymm1[3]
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm28[1],ymm26[1],ymm28[3],ymm26[3]
+; AVX512F-NEXT:    vmovdqa 640(%rdi), %ymm2
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm15 = ymm2[0],ymm1[0],ymm2[2],ymm1[2]
+; AVX512F-NEXT:    vmovdqa64 576(%rdi), %ymm25
+; AVX512F-NEXT:    vmovdqa64 512(%rdi), %ymm27
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm8 = ymm27[0],ymm25[0],ymm27[2],ymm25[2]
+; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm8 = ymm8[2,3],ymm15[2,3]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm8, %zmm7, %zmm7
+; AVX512F-NEXT:    vmovdqu64 %zmm7, (%rsp) # 64-byte Spill
+; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm7 = [3,11,3,11,3,11,3,11]
+; AVX512F-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vpermt2q %zmm12, %zmm7, %zmm4
+; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm8
+; AVX512F-NEXT:    vpermt2q %zmm13, %zmm7, %zmm8
+; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm8 {%k1}
+; AVX512F-NEXT:    vmovdqa64 64(%rdi), %zmm26
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm20[1],ymm0[1],ymm20[3],ymm0[3]
+; AVX512F-NEXT:    vmovdqa64 (%rdi), %zmm15
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm4 = ymm23[1],ymm22[1],ymm23[3],ymm22[3]
+; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm23 = [1,9,1,9,1,9,1,9]
+; AVX512F-NEXT:    # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm4[2,3],ymm0[2,3]
+; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm24
+; AVX512F-NEXT:    vpermt2q %zmm13, %zmm23, %zmm24
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm8, %zmm20
+; AVX512F-NEXT:    vmovdqa64 %zmm11, %zmm0
+; AVX512F-NEXT:    vpermt2q %zmm9, %zmm7, %zmm0
+; AVX512F-NEXT:    vpermi2q %zmm10, %zmm29, %zmm7
+; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm7 {%k1}
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm2[1],ymm1[1],ymm2[3],ymm1[3]
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm27[1],ymm25[1],ymm27[3],ymm25[3]
 ; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm29, %zmm22
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm7, %zmm22
 ; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12]
 ; AVX512F-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vpermt2q %zmm10, %zmm0, %zmm5
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm1 = zmm7[0],zmm11[0],zmm7[2],zmm11[2],zmm7[4],zmm11[4],zmm7[6],zmm11[6]
-; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm1 {%k1}
-; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm4
-; AVX512F-NEXT:    vpermt2q %zmm27, %zmm0, %zmm4
-; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [4,12,4,12]
-; AVX512F-NEXT:    # ymm5 = mem[0,1,0,1]
-; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm14
-; AVX512F-NEXT:    vpermt2q %zmm25, %zmm5, %zmm14
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm14[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm4, %zmm1, %zmm24
-; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm1
-; AVX512F-NEXT:    vpermt2q %zmm17, %zmm0, %zmm1
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm4 = zmm8[0],zmm16[0],zmm8[2],zmm16[2],zmm8[4],zmm16[4],zmm8[6],zmm16[6]
-; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512F-NEXT:    vpermi2q %zmm30, %zmm6, %zmm0
-; AVX512F-NEXT:    vpermi2q %zmm19, %zmm2, %zmm5
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm26
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm14 = [5,13,5,13,5,13,5,13]
-; AVX512F-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vpermt2q %zmm12, %zmm0, %zmm6
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm1 = zmm5[0],zmm13[0],zmm5[2],zmm13[2],zmm5[4],zmm13[4],zmm5[6],zmm13[6]
+; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm1 {%k1}
+; AVX512F-NEXT:    vmovdqa64 %zmm15, %zmm2
+; AVX512F-NEXT:    vpermt2q %zmm26, %zmm0, %zmm2
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [4,12,4,12]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
+; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm6
+; AVX512F-NEXT:    vpermt2q %zmm30, %zmm4, %zmm6
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm1, %zmm25
+; AVX512F-NEXT:    vmovdqa64 %zmm11, %zmm1
+; AVX512F-NEXT:    vpermt2q %zmm9, %zmm0, %zmm1
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm2 = zmm29[0],zmm10[0],zmm29[2],zmm10[2],zmm29[4],zmm10[4],zmm29[6],zmm10[6]
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-NEXT:    vpermi2q %zmm21, %zmm1, %zmm0
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-NEXT:    vpermi2q %zmm28, %zmm1, %zmm4
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm27
+; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm6 = [5,13,5,13,5,13,5,13]
+; AVX512F-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vmovdqa64 %zmm15, %zmm0
+; AVX512F-NEXT:    vpermt2q %zmm26, %zmm6, %zmm0
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm7 = [5,13,5,13]
+; AVX512F-NEXT:    # ymm7 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm1
-; AVX512F-NEXT:    vpermt2q %zmm27, %zmm14, %zmm1
-; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [5,13,5,13]
-; AVX512F-NEXT:    # ymm2 = mem[0,1,0,1]
-; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm4
-; AVX512F-NEXT:    vpermt2q %zmm25, %zmm2, %zmm4
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm4[4,5,6,7]
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm28 = [1,9,1,9,1,9,1,9]
-; AVX512F-NEXT:    # zmm28 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm7, %zmm29
-; AVX512F-NEXT:    vpermt2q %zmm11, %zmm28, %zmm29
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm5 = [6,14,6,14,6,14,6,14]
-; AVX512F-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm7, %zmm4
-; AVX512F-NEXT:    vpermt2q %zmm11, %zmm5, %zmm4
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm15 = zmm7[1],zmm11[1],zmm7[3],zmm11[3],zmm7[5],zmm11[5],zmm7[7],zmm11[7]
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [7,15,7,15,7,15,7,15]
-; AVX512F-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vpermt2q %zmm11, %zmm1, %zmm7
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm12[0],zmm10[0],zmm12[2],zmm10[2],zmm12[4],zmm10[4],zmm12[6],zmm10[6]
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm7 {%k1} = zmm12[1],zmm10[1],zmm12[3],zmm10[3],zmm12[5],zmm10[5],zmm12[7],zmm10[7]
-; AVX512F-NEXT:    vmovdqa64 %zmm12, %zmm11
-; AVX512F-NEXT:    vpermt2q %zmm10, %zmm28, %zmm11
-; AVX512F-NEXT:    vpermt2q %zmm10, %zmm14, %zmm12
-; AVX512F-NEXT:    vmovdqa64 %zmm12, %zmm15 {%k1}
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm15, %zmm20
-; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm10
-; AVX512F-NEXT:    vpermt2q %zmm17, %zmm28, %zmm10
-; AVX512F-NEXT:    vpermi2q %zmm16, %zmm8, %zmm28
-; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm12
-; AVX512F-NEXT:    vpermt2q %zmm16, %zmm5, %zmm12
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm15 = zmm8[1],zmm16[1],zmm8[3],zmm16[3],zmm8[5],zmm16[5],zmm8[7],zmm16[7]
-; AVX512F-NEXT:    vpermt2q %zmm16, %zmm1, %zmm8
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm12 {%k1} = zmm9[0],zmm17[0],zmm9[2],zmm17[2],zmm9[4],zmm17[4],zmm9[6],zmm17[6]
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm8 {%k1} = zmm9[1],zmm17[1],zmm9[3],zmm17[3],zmm9[5],zmm17[5],zmm9[7],zmm17[7]
-; AVX512F-NEXT:    vpermt2q %zmm17, %zmm14, %zmm9
-; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm15 {%k1}
-; AVX512F-NEXT:    vpermi2q %zmm30, %zmm6, %zmm14
+; AVX512F-NEXT:    vpermt2q %zmm30, %zmm7, %zmm1
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm4 = [6,14,6,14,6,14,6,14]
+; AVX512F-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm1
+; AVX512F-NEXT:    vpermt2q %zmm13, %zmm4, %zmm1
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm2 = zmm5[1],zmm13[1],zmm5[3],zmm13[3],zmm5[5],zmm13[5],zmm5[7],zmm13[7]
+; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm8 = [7,15,7,15,7,15,7,15]
+; AVX512F-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vpermt2q %zmm13, %zmm8, %zmm5
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm1 {%k1} = zmm14[0],zmm12[0],zmm14[2],zmm12[2],zmm14[4],zmm12[4],zmm14[6],zmm12[6]
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm14[1],zmm12[1],zmm14[3],zmm12[3],zmm14[5],zmm12[5],zmm14[7],zmm12[7]
+; AVX512F-NEXT:    vmovdqa64 %zmm14, %zmm13
+; AVX512F-NEXT:    vpermt2q %zmm12, %zmm23, %zmm13
+; AVX512F-NEXT:    vpermt2q %zmm12, %zmm6, %zmm14
+; AVX512F-NEXT:    vmovdqa64 %zmm14, %zmm2 {%k1}
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm17
+; AVX512F-NEXT:    vmovdqa64 %zmm11, %zmm2
+; AVX512F-NEXT:    vpermt2q %zmm9, %zmm23, %zmm2
+; AVX512F-NEXT:    vpermi2q %zmm10, %zmm29, %zmm23
+; AVX512F-NEXT:    vmovdqa64 %zmm29, %zmm12
+; AVX512F-NEXT:    vpermt2q %zmm10, %zmm4, %zmm12
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm14 = zmm29[1],zmm10[1],zmm29[3],zmm10[3],zmm29[5],zmm10[5],zmm29[7],zmm10[7]
+; AVX512F-NEXT:    vpermt2q %zmm10, %zmm8, %zmm29
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm12 {%k1} = zmm11[0],zmm9[0],zmm11[2],zmm9[2],zmm11[4],zmm9[4],zmm11[6],zmm9[6]
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm29 {%k1} = zmm11[1],zmm9[1],zmm11[3],zmm9[3],zmm11[5],zmm9[5],zmm11[7],zmm9[7]
+; AVX512F-NEXT:    vpermt2q %zmm9, %zmm6, %zmm11
+; AVX512F-NEXT:    vmovdqa64 %zmm11, %zmm14 {%k1}
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512F-NEXT:    vpermi2q %zmm21, %zmm10, %zmm6
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm19, %zmm16
-; AVX512F-NEXT:    vpermi2q %zmm19, %zmm0, %zmm2
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3],ymm2[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm15, %zmm19
-; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm9
-; AVX512F-NEXT:    vpermt2q %zmm27, %zmm5, %zmm9
-; AVX512F-NEXT:    vpermt2q %zmm27, %zmm1, %zmm3
-; AVX512F-NEXT:    vpermi2q %zmm30, %zmm6, %zmm5
-; AVX512F-NEXT:    vpermt2q %zmm30, %zmm1, %zmm6
-; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14]
-; AVX512F-NEXT:    # ymm1 = mem[0,1,0,1]
-; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm14
-; AVX512F-NEXT:    vpermt2q %zmm25, %zmm1, %zmm14
-; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm15 = [7,15,7,15]
-; AVX512F-NEXT:    # ymm15 = mem[0,1,0,1]
-; AVX512F-NEXT:    vpermt2q %zmm25, %zmm15, %zmm13
-; AVX512F-NEXT:    vpermi2q %zmm16, %zmm0, %zmm1
-; AVX512F-NEXT:    vpermt2q %zmm16, %zmm15, %zmm0
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm2
-; AVX512F-NEXT:    vmovdqa 704(%rdi), %xmm15
-; AVX512F-NEXT:    vmovdqa64 640(%rdi), %xmm16
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm17 = xmm16[0],xmm15[0]
-; AVX512F-NEXT:    vmovdqa64 576(%rdi), %xmm25
-; AVX512F-NEXT:    vmovdqa64 512(%rdi), %xmm27
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm30 = xmm27[0],xmm25[0]
-; AVX512F-NEXT:    vinserti32x4 $1, %xmm17, %ymm30, %ymm17
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm17, %zmm18, %zmm17
-; AVX512F-NEXT:    vmovdqa64 %zmm11, %zmm29 {%k1}
-; AVX512F-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm31, %xmm11 # 16-byte Folded Reload
-; AVX512F-NEXT:    # xmm11 = xmm31[1],mem[1]
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} xmm18 = xmm23[1],xmm21[1]
-; AVX512F-NEXT:    vinserti32x4 $1, %xmm11, %ymm18, %ymm11
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm11, %zmm29, %zmm11
-; AVX512F-NEXT:    vmovdqa64 %zmm10, %zmm28 {%k1}
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} xmm10 = xmm16[1],xmm15[1]
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} xmm15 = xmm27[1],xmm25[1]
-; AVX512F-NEXT:    vinserti128 $1, %xmm10, %ymm15, %ymm10
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm10, %zmm28, %zmm10
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm14[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm9, %zmm4, %zmm4
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm1[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm12, %zmm0
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm13[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm3, %zmm7, %zmm3
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm2[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm5, %zmm8, %zmm5
-; AVX512F-NEXT:    vmovdqa64 %zmm17, 64(%rsi)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm1, (%rsi)
-; AVX512F-NEXT:    vmovdqa64 %zmm10, 64(%rdx)
-; AVX512F-NEXT:    vmovdqa64 %zmm11, (%rdx)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm1, 64(%rcx)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm1, (%rcx)
+; AVX512F-NEXT:    vpermi2q %zmm28, %zmm0, %zmm7
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm6, %zmm14, %zmm16
+; AVX512F-NEXT:    vmovdqa64 %zmm15, %zmm7
+; AVX512F-NEXT:    vpermt2q %zmm26, %zmm4, %zmm7
+; AVX512F-NEXT:    vpermt2q %zmm26, %zmm8, %zmm15
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm9 = [6,14,6,14]
+; AVX512F-NEXT:    # ymm9 = mem[0,1,0,1]
+; AVX512F-NEXT:    vpermi2q %zmm21, %zmm10, %zmm4
+; AVX512F-NEXT:    vpermt2q %zmm21, %zmm8, %zmm10
+; AVX512F-NEXT:    vmovdqa64 %zmm10, %zmm6
+; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm8
+; AVX512F-NEXT:    vpermt2q %zmm30, %zmm9, %zmm8
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm10 = [7,15,7,15]
+; AVX512F-NEXT:    # ymm10 = mem[0,1,0,1]
+; AVX512F-NEXT:    vpermt2q %zmm30, %zmm10, %zmm3
+; AVX512F-NEXT:    vmovdqa 576(%rdi), %xmm11
+; AVX512F-NEXT:    vinserti128 $1, 704(%rdi), %ymm11, %ymm11
+; AVX512F-NEXT:    vpermi2q %zmm28, %zmm0, %zmm9
+; AVX512F-NEXT:    vpermt2q %zmm28, %zmm10, %zmm0
+; AVX512F-NEXT:    vmovdqa 512(%rdi), %xmm10
+; AVX512F-NEXT:    vinserti128 $1, 640(%rdi), %ymm10, %ymm10
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm14 = ymm10[0],ymm11[0],ymm10[2],ymm11[2]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm14, %zmm31, %zmm14
+; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm24 {%k1}
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm13 = ymm18[1],ymm19[1],ymm18[3],ymm19[3]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm13, %zmm24, %zmm13
+; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm23 {%k1}
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm10[1],ymm11[1],ymm10[3],ymm11[3]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm23, %zmm2
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm7, %zmm1, %zmm1
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm9[4,5,6,7]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm4, %zmm12, %zmm4
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm7 = ymm15[0,1,2,3],ymm3[4,5,6,7]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm7, %zmm5, %zmm5
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm7 = ymm6[0,1,2,3],ymm0[4,5,6,7]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm7, %zmm29, %zmm3
+; AVX512F-NEXT:    vmovdqa64 %zmm14, 64(%rsi)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm0, (%rsi)
+; AVX512F-NEXT:    vmovdqa64 %zmm2, 64(%rdx)
+; AVX512F-NEXT:    vmovdqa64 %zmm13, (%rdx)
+; AVX512F-NEXT:    vmovups (%rsp), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm0, 64(%rcx)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm0, (%rcx)
 ; AVX512F-NEXT:    vmovdqa64 %zmm22, 64(%r8)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm1, (%r8)
-; AVX512F-NEXT:    vmovdqa64 %zmm26, 64(%r9)
-; AVX512F-NEXT:    vmovdqa64 %zmm24, (%r9)
+; AVX512F-NEXT:    vmovdqa64 %zmm20, (%r8)
+; AVX512F-NEXT:    vmovdqa64 %zmm27, 64(%r9)
+; AVX512F-NEXT:    vmovdqa64 %zmm25, (%r9)
 ; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512F-NEXT:    vmovdqa64 %zmm19, 64(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm20, (%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm16, 64(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm17, (%rax)
 ; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512F-NEXT:    vmovdqa64 %zmm0, 64(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm4, (%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm4, 64(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm1, (%rax)
 ; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512F-NEXT:    vmovdqa64 %zmm5, 64(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm3, (%rax)
-; AVX512F-NEXT:    addq $280, %rsp # imm = 0x118
+; AVX512F-NEXT:    vmovdqa64 %zmm3, 64(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm5, (%rax)
+; AVX512F-NEXT:    addq $200, %rsp
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512BW-LABEL: load_i64_stride8_vf16:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    subq $280, %rsp # imm = 0x118
-; AVX512BW-NEXT:    vmovdqa64 704(%rdi), %zmm19
-; AVX512BW-NEXT:    vmovdqa64 640(%rdi), %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 576(%rdi), %zmm30
+; AVX512BW-NEXT:    subq $200, %rsp
+; AVX512BW-NEXT:    vmovdqa64 704(%rdi), %zmm28
+; AVX512BW-NEXT:    vmovaps 640(%rdi), %zmm0
+; AVX512BW-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 576(%rdi), %zmm21
 ; AVX512BW-NEXT:    vmovaps 512(%rdi), %zmm0
 ; AVX512BW-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 832(%rdi), %zmm16
-; AVX512BW-NEXT:    vmovdqa64 768(%rdi), %zmm8
-; AVX512BW-NEXT:    vmovdqa64 960(%rdi), %zmm17
-; AVX512BW-NEXT:    vmovdqa64 896(%rdi), %zmm9
-; AVX512BW-NEXT:    vmovdqa64 320(%rdi), %zmm11
-; AVX512BW-NEXT:    vmovdqa64 256(%rdi), %zmm7
-; AVX512BW-NEXT:    vmovdqa64 448(%rdi), %zmm10
-; AVX512BW-NEXT:    vmovdqa64 384(%rdi), %zmm12
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm18 = [0,8,0,8,0,8,0,8]
-; AVX512BW-NEXT:    # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm12, %zmm0
-; AVX512BW-NEXT:    vmovdqa64 %zmm12, %zmm20
-; AVX512BW-NEXT:    vmovdqa64 %zmm12, %zmm5
-; AVX512BW-NEXT:    vmovdqa64 %zmm12, %zmm3
-; AVX512BW-NEXT:    vpermt2q %zmm10, %zmm18, %zmm3
-; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm13
-; AVX512BW-NEXT:    vpermt2q %zmm11, %zmm18, %zmm13
+; AVX512BW-NEXT:    vmovdqa64 192(%rdi), %zmm30
+; AVX512BW-NEXT:    vmovdqa64 128(%rdi), %zmm3
+; AVX512BW-NEXT:    vmovdqa64 832(%rdi), %zmm10
+; AVX512BW-NEXT:    vmovdqa64 768(%rdi), %zmm29
+; AVX512BW-NEXT:    vmovdqa64 960(%rdi), %zmm9
+; AVX512BW-NEXT:    vmovdqa64 896(%rdi), %zmm11
+; AVX512BW-NEXT:    vmovdqa64 320(%rdi), %zmm13
+; AVX512BW-NEXT:    vmovdqa64 256(%rdi), %zmm5
+; AVX512BW-NEXT:    vmovdqa64 448(%rdi), %zmm12
+; AVX512BW-NEXT:    vmovdqa64 384(%rdi), %zmm14
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm31 = [0,8,0,8,0,8,0,8]
+; AVX512BW-NEXT:    # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vmovdqa64 %zmm14, %zmm0
+; AVX512BW-NEXT:    vmovdqa64 %zmm14, %zmm4
+; AVX512BW-NEXT:    vmovdqa64 %zmm14, %zmm6
+; AVX512BW-NEXT:    vmovdqa64 %zmm14, %zmm7
+; AVX512BW-NEXT:    vpermt2q %zmm12, %zmm31, %zmm7
+; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm15
+; AVX512BW-NEXT:    vpermt2q %zmm13, %zmm31, %zmm15
 ; AVX512BW-NEXT:    movb $-64, %al
 ; AVX512BW-NEXT:    kmovd %eax, %k1
-; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm13 {%k1}
-; AVX512BW-NEXT:    vmovdqa 192(%rdi), %xmm1
-; AVX512BW-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 128(%rdi), %xmm31
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm31[0],xmm1[0]
-; AVX512BW-NEXT:    vmovdqa64 (%rdi), %xmm23
-; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %xmm21
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm14 = xmm23[0],xmm21[0]
-; AVX512BW-NEXT:    vinserti128 $1, %xmm3, %ymm14, %ymm3
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm3, %zmm13, %zmm1
+; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm15 {%k1}
+; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm7
+; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %xmm16
+; AVX512BW-NEXT:    vinserti32x4 $1, 192(%rdi), %ymm16, %ymm19
+; AVX512BW-NEXT:    vinserti32x4 $1, 128(%rdi), %ymm7, %ymm18
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm7 = ymm18[0],ymm19[0],ymm18[2],ymm19[2]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm7, %zmm15, %zmm1
 ; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm3
-; AVX512BW-NEXT:    vpermt2q %zmm17, %zmm18, %zmm3
-; AVX512BW-NEXT:    vpermi2q %zmm16, %zmm8, %zmm18
-; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm18 {%k1}
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [2,10,2,10,2,10,2,10]
-; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vpermt2q %zmm10, %zmm3, %zmm0
-; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm13
-; AVX512BW-NEXT:    vpermt2q %zmm11, %zmm3, %zmm13
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm13 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm7
+; AVX512BW-NEXT:    vpermt2q %zmm9, %zmm31, %zmm7
+; AVX512BW-NEXT:    vpermi2q %zmm10, %zmm29, %zmm31
+; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm31 {%k1}
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm7 = [2,10,2,10,2,10,2,10]
+; AVX512BW-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vpermt2q %zmm12, %zmm7, %zmm0
+; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm15
+; AVX512BW-NEXT:    vpermt2q %zmm13, %zmm7, %zmm15
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm15 {%k1}
 ; AVX512BW-NEXT:    vmovdqa 192(%rdi), %ymm0
-; AVX512BW-NEXT:    vmovdqa 128(%rdi), %ymm14
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm14[0],ymm0[0],ymm14[2],ymm0[2]
+; AVX512BW-NEXT:    vmovdqa64 128(%rdi), %ymm20
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm20[0],ymm0[0],ymm20[2],ymm0[2]
 ; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %ymm22
-; AVX512BW-NEXT:    vmovdqa64 (%rdi), %ymm24
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm15 = ymm24[0],ymm22[0],ymm24[2],ymm22[2]
-; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm15[2,3],ymm1[2,3]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm13, %zmm1
+; AVX512BW-NEXT:    vmovdqa64 (%rdi), %ymm23
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm2 = ymm23[0],ymm22[0],ymm23[2],ymm22[2]
+; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm15, %zmm1
 ; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm1
-; AVX512BW-NEXT:    vpermt2q %zmm17, %zmm3, %zmm1
-; AVX512BW-NEXT:    vpermi2q %zmm16, %zmm8, %zmm3
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm3 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm1
+; AVX512BW-NEXT:    vpermt2q %zmm9, %zmm7, %zmm1
+; AVX512BW-NEXT:    vpermi2q %zmm10, %zmm29, %zmm7
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm7 {%k1}
 ; AVX512BW-NEXT:    vmovdqa 704(%rdi), %ymm1
-; AVX512BW-NEXT:    vmovdqa 640(%rdi), %ymm15
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm13 = ymm15[0],ymm1[0],ymm15[2],ymm1[2]
-; AVX512BW-NEXT:    vmovdqa64 576(%rdi), %ymm26
-; AVX512BW-NEXT:    vmovdqa64 512(%rdi), %ymm28
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm6 = ymm28[0],ymm26[0],ymm28[2],ymm26[2]
-; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm6 = ymm6[2,3],ymm13[2,3]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm6, %zmm3, %zmm3
-; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm29 = [3,11,3,11,3,11,3,11]
-; AVX512BW-NEXT:    # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vpermt2q %zmm10, %zmm29, %zmm20
-; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm4
-; AVX512BW-NEXT:    vpermt2q %zmm11, %zmm29, %zmm4
-; AVX512BW-NEXT:    vmovdqa64 %zmm20, %zmm4 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 192(%rdi), %zmm25
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm14[1],ymm0[1],ymm14[3],ymm0[3]
-; AVX512BW-NEXT:    vmovdqa64 128(%rdi), %zmm13
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm24[1],ymm22[1],ymm24[3],ymm22[3]
-; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %zmm27
-; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
-; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm3
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm17, %zmm29, %zmm0
-; AVX512BW-NEXT:    vpermi2q %zmm16, %zmm8, %zmm29
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm29 {%k1}
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm15[1],ymm1[1],ymm15[3],ymm1[3]
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm28[1],ymm26[1],ymm28[3],ymm26[3]
+; AVX512BW-NEXT:    vmovdqa 640(%rdi), %ymm2
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm15 = ymm2[0],ymm1[0],ymm2[2],ymm1[2]
+; AVX512BW-NEXT:    vmovdqa64 576(%rdi), %ymm25
+; AVX512BW-NEXT:    vmovdqa64 512(%rdi), %ymm27
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm8 = ymm27[0],ymm25[0],ymm27[2],ymm25[2]
+; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm8 = ymm8[2,3],ymm15[2,3]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm8, %zmm7, %zmm7
+; AVX512BW-NEXT:    vmovdqu64 %zmm7, (%rsp) # 64-byte Spill
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm7 = [3,11,3,11,3,11,3,11]
+; AVX512BW-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vpermt2q %zmm12, %zmm7, %zmm4
+; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm8
+; AVX512BW-NEXT:    vpermt2q %zmm13, %zmm7, %zmm8
+; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm8 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %zmm26
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm20[1],ymm0[1],ymm20[3],ymm0[3]
+; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm15
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm4 = ymm23[1],ymm22[1],ymm23[3],ymm22[3]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm23 = [1,9,1,9,1,9,1,9]
+; AVX512BW-NEXT:    # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm4[2,3],ymm0[2,3]
+; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm24
+; AVX512BW-NEXT:    vpermt2q %zmm13, %zmm23, %zmm24
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm8, %zmm20
+; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm0
+; AVX512BW-NEXT:    vpermt2q %zmm9, %zmm7, %zmm0
+; AVX512BW-NEXT:    vpermi2q %zmm10, %zmm29, %zmm7
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm7 {%k1}
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm2[1],ymm1[1],ymm2[3],ymm1[3]
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm27[1],ymm25[1],ymm27[3],ymm25[3]
 ; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm29, %zmm22
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm7, %zmm22
 ; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12]
 ; AVX512BW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vpermt2q %zmm10, %zmm0, %zmm5
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm1 = zmm7[0],zmm11[0],zmm7[2],zmm11[2],zmm7[4],zmm11[4],zmm7[6],zmm11[6]
-; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm1 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm4
-; AVX512BW-NEXT:    vpermt2q %zmm27, %zmm0, %zmm4
-; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [4,12,4,12]
-; AVX512BW-NEXT:    # ymm5 = mem[0,1,0,1]
-; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm14
-; AVX512BW-NEXT:    vpermt2q %zmm25, %zmm5, %zmm14
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm14[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm4, %zmm1, %zmm24
-; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm1
-; AVX512BW-NEXT:    vpermt2q %zmm17, %zmm0, %zmm1
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm4 = zmm8[0],zmm16[0],zmm8[2],zmm16[2],zmm8[4],zmm16[4],zmm8[6],zmm16[6]
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512BW-NEXT:    vpermi2q %zmm30, %zmm6, %zmm0
-; AVX512BW-NEXT:    vpermi2q %zmm19, %zmm2, %zmm5
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm26
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm14 = [5,13,5,13,5,13,5,13]
-; AVX512BW-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vpermt2q %zmm12, %zmm0, %zmm6
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm1 = zmm5[0],zmm13[0],zmm5[2],zmm13[2],zmm5[4],zmm13[4],zmm5[6],zmm13[6]
+; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm1 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 %zmm15, %zmm2
+; AVX512BW-NEXT:    vpermt2q %zmm26, %zmm0, %zmm2
+; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [4,12,4,12]
+; AVX512BW-NEXT:    # ymm4 = mem[0,1,0,1]
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm6
+; AVX512BW-NEXT:    vpermt2q %zmm30, %zmm4, %zmm6
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm1, %zmm25
+; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm1
+; AVX512BW-NEXT:    vpermt2q %zmm9, %zmm0, %zmm1
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm2 = zmm29[0],zmm10[0],zmm29[2],zmm10[2],zmm29[4],zmm10[4],zmm29[6],zmm10[6]
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-NEXT:    vpermi2q %zmm21, %zmm1, %zmm0
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-NEXT:    vpermi2q %zmm28, %zmm1, %zmm4
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm27
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm6 = [5,13,5,13,5,13,5,13]
+; AVX512BW-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vmovdqa64 %zmm15, %zmm0
+; AVX512BW-NEXT:    vpermt2q %zmm26, %zmm6, %zmm0
+; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm7 = [5,13,5,13]
+; AVX512BW-NEXT:    # ymm7 = mem[0,1,0,1]
 ; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm1
-; AVX512BW-NEXT:    vpermt2q %zmm27, %zmm14, %zmm1
-; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [5,13,5,13]
-; AVX512BW-NEXT:    # ymm2 = mem[0,1,0,1]
-; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm4
-; AVX512BW-NEXT:    vpermt2q %zmm25, %zmm2, %zmm4
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm4[4,5,6,7]
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm28 = [1,9,1,9,1,9,1,9]
-; AVX512BW-NEXT:    # zmm28 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm29
-; AVX512BW-NEXT:    vpermt2q %zmm11, %zmm28, %zmm29
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm5 = [6,14,6,14,6,14,6,14]
-; AVX512BW-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm4
-; AVX512BW-NEXT:    vpermt2q %zmm11, %zmm5, %zmm4
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm15 = zmm7[1],zmm11[1],zmm7[3],zmm11[3],zmm7[5],zmm11[5],zmm7[7],zmm11[7]
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [7,15,7,15,7,15,7,15]
-; AVX512BW-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vpermt2q %zmm11, %zmm1, %zmm7
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm12[0],zmm10[0],zmm12[2],zmm10[2],zmm12[4],zmm10[4],zmm12[6],zmm10[6]
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm7 {%k1} = zmm12[1],zmm10[1],zmm12[3],zmm10[3],zmm12[5],zmm10[5],zmm12[7],zmm10[7]
-; AVX512BW-NEXT:    vmovdqa64 %zmm12, %zmm11
-; AVX512BW-NEXT:    vpermt2q %zmm10, %zmm28, %zmm11
-; AVX512BW-NEXT:    vpermt2q %zmm10, %zmm14, %zmm12
-; AVX512BW-NEXT:    vmovdqa64 %zmm12, %zmm15 {%k1}
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm15, %zmm20
-; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm10
-; AVX512BW-NEXT:    vpermt2q %zmm17, %zmm28, %zmm10
-; AVX512BW-NEXT:    vpermi2q %zmm16, %zmm8, %zmm28
-; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm12
-; AVX512BW-NEXT:    vpermt2q %zmm16, %zmm5, %zmm12
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm15 = zmm8[1],zmm16[1],zmm8[3],zmm16[3],zmm8[5],zmm16[5],zmm8[7],zmm16[7]
-; AVX512BW-NEXT:    vpermt2q %zmm16, %zmm1, %zmm8
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm12 {%k1} = zmm9[0],zmm17[0],zmm9[2],zmm17[2],zmm9[4],zmm17[4],zmm9[6],zmm17[6]
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm8 {%k1} = zmm9[1],zmm17[1],zmm9[3],zmm17[3],zmm9[5],zmm17[5],zmm9[7],zmm17[7]
-; AVX512BW-NEXT:    vpermt2q %zmm17, %zmm14, %zmm9
-; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm15 {%k1}
-; AVX512BW-NEXT:    vpermi2q %zmm30, %zmm6, %zmm14
+; AVX512BW-NEXT:    vpermt2q %zmm30, %zmm7, %zmm1
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm4 = [6,14,6,14,6,14,6,14]
+; AVX512BW-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm1
+; AVX512BW-NEXT:    vpermt2q %zmm13, %zmm4, %zmm1
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm2 = zmm5[1],zmm13[1],zmm5[3],zmm13[3],zmm5[5],zmm13[5],zmm5[7],zmm13[7]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm8 = [7,15,7,15,7,15,7,15]
+; AVX512BW-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vpermt2q %zmm13, %zmm8, %zmm5
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm1 {%k1} = zmm14[0],zmm12[0],zmm14[2],zmm12[2],zmm14[4],zmm12[4],zmm14[6],zmm12[6]
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm14[1],zmm12[1],zmm14[3],zmm12[3],zmm14[5],zmm12[5],zmm14[7],zmm12[7]
+; AVX512BW-NEXT:    vmovdqa64 %zmm14, %zmm13
+; AVX512BW-NEXT:    vpermt2q %zmm12, %zmm23, %zmm13
+; AVX512BW-NEXT:    vpermt2q %zmm12, %zmm6, %zmm14
+; AVX512BW-NEXT:    vmovdqa64 %zmm14, %zmm2 {%k1}
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm17
+; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm2
+; AVX512BW-NEXT:    vpermt2q %zmm9, %zmm23, %zmm2
+; AVX512BW-NEXT:    vpermi2q %zmm10, %zmm29, %zmm23
+; AVX512BW-NEXT:    vmovdqa64 %zmm29, %zmm12
+; AVX512BW-NEXT:    vpermt2q %zmm10, %zmm4, %zmm12
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm14 = zmm29[1],zmm10[1],zmm29[3],zmm10[3],zmm29[5],zmm10[5],zmm29[7],zmm10[7]
+; AVX512BW-NEXT:    vpermt2q %zmm10, %zmm8, %zmm29
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm12 {%k1} = zmm11[0],zmm9[0],zmm11[2],zmm9[2],zmm11[4],zmm9[4],zmm11[6],zmm9[6]
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm29 {%k1} = zmm11[1],zmm9[1],zmm11[3],zmm9[3],zmm11[5],zmm9[5],zmm11[7],zmm9[7]
+; AVX512BW-NEXT:    vpermt2q %zmm9, %zmm6, %zmm11
+; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm14 {%k1}
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512BW-NEXT:    vpermi2q %zmm21, %zmm10, %zmm6
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm19, %zmm16
-; AVX512BW-NEXT:    vpermi2q %zmm19, %zmm0, %zmm2
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3],ymm2[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm15, %zmm19
-; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm9
-; AVX512BW-NEXT:    vpermt2q %zmm27, %zmm5, %zmm9
-; AVX512BW-NEXT:    vpermt2q %zmm27, %zmm1, %zmm3
-; AVX512BW-NEXT:    vpermi2q %zmm30, %zmm6, %zmm5
-; AVX512BW-NEXT:    vpermt2q %zmm30, %zmm1, %zmm6
-; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14]
-; AVX512BW-NEXT:    # ymm1 = mem[0,1,0,1]
-; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm14
-; AVX512BW-NEXT:    vpermt2q %zmm25, %zmm1, %zmm14
-; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm15 = [7,15,7,15]
-; AVX512BW-NEXT:    # ymm15 = mem[0,1,0,1]
-; AVX512BW-NEXT:    vpermt2q %zmm25, %zmm15, %zmm13
-; AVX512BW-NEXT:    vpermi2q %zmm16, %zmm0, %zmm1
-; AVX512BW-NEXT:    vpermt2q %zmm16, %zmm15, %zmm0
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa 704(%rdi), %xmm15
-; AVX512BW-NEXT:    vmovdqa64 640(%rdi), %xmm16
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm17 = xmm16[0],xmm15[0]
-; AVX512BW-NEXT:    vmovdqa64 576(%rdi), %xmm25
-; AVX512BW-NEXT:    vmovdqa64 512(%rdi), %xmm27
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm30 = xmm27[0],xmm25[0]
-; AVX512BW-NEXT:    vinserti32x4 $1, %xmm17, %ymm30, %ymm17
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm17, %zmm18, %zmm17
-; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm29 {%k1}
-; AVX512BW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm31, %xmm11 # 16-byte Folded Reload
-; AVX512BW-NEXT:    # xmm11 = xmm31[1],mem[1]
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} xmm18 = xmm23[1],xmm21[1]
-; AVX512BW-NEXT:    vinserti32x4 $1, %xmm11, %ymm18, %ymm11
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm11, %zmm29, %zmm11
-; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm28 {%k1}
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} xmm10 = xmm16[1],xmm15[1]
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} xmm15 = xmm27[1],xmm25[1]
-; AVX512BW-NEXT:    vinserti128 $1, %xmm10, %ymm15, %ymm10
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm10, %zmm28, %zmm10
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm14[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm9, %zmm4, %zmm4
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm1[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm12, %zmm0
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm13[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm3, %zmm7, %zmm3
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm2[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm5, %zmm8, %zmm5
-; AVX512BW-NEXT:    vmovdqa64 %zmm17, 64(%rsi)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm1, (%rsi)
-; AVX512BW-NEXT:    vmovdqa64 %zmm10, 64(%rdx)
-; AVX512BW-NEXT:    vmovdqa64 %zmm11, (%rdx)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm1, 64(%rcx)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm1, (%rcx)
+; AVX512BW-NEXT:    vpermi2q %zmm28, %zmm0, %zmm7
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm6, %zmm14, %zmm16
+; AVX512BW-NEXT:    vmovdqa64 %zmm15, %zmm7
+; AVX512BW-NEXT:    vpermt2q %zmm26, %zmm4, %zmm7
+; AVX512BW-NEXT:    vpermt2q %zmm26, %zmm8, %zmm15
+; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm9 = [6,14,6,14]
+; AVX512BW-NEXT:    # ymm9 = mem[0,1,0,1]
+; AVX512BW-NEXT:    vpermi2q %zmm21, %zmm10, %zmm4
+; AVX512BW-NEXT:    vpermt2q %zmm21, %zmm8, %zmm10
+; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm6
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm8
+; AVX512BW-NEXT:    vpermt2q %zmm30, %zmm9, %zmm8
+; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm10 = [7,15,7,15]
+; AVX512BW-NEXT:    # ymm10 = mem[0,1,0,1]
+; AVX512BW-NEXT:    vpermt2q %zmm30, %zmm10, %zmm3
+; AVX512BW-NEXT:    vmovdqa 576(%rdi), %xmm11
+; AVX512BW-NEXT:    vinserti128 $1, 704(%rdi), %ymm11, %ymm11
+; AVX512BW-NEXT:    vpermi2q %zmm28, %zmm0, %zmm9
+; AVX512BW-NEXT:    vpermt2q %zmm28, %zmm10, %zmm0
+; AVX512BW-NEXT:    vmovdqa 512(%rdi), %xmm10
+; AVX512BW-NEXT:    vinserti128 $1, 640(%rdi), %ymm10, %ymm10
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm14 = ymm10[0],ymm11[0],ymm10[2],ymm11[2]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm14, %zmm31, %zmm14
+; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm24 {%k1}
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm13 = ymm18[1],ymm19[1],ymm18[3],ymm19[3]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm13, %zmm24, %zmm13
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm23 {%k1}
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm10[1],ymm11[1],ymm10[3],ymm11[3]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm23, %zmm2
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm7, %zmm1, %zmm1
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm9[4,5,6,7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm4, %zmm12, %zmm4
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm7 = ymm15[0,1,2,3],ymm3[4,5,6,7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm7, %zmm5, %zmm5
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm7 = ymm6[0,1,2,3],ymm0[4,5,6,7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm7, %zmm29, %zmm3
+; AVX512BW-NEXT:    vmovdqa64 %zmm14, 64(%rsi)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm0, (%rsi)
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, 64(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm13, (%rdx)
+; AVX512BW-NEXT:    vmovups (%rsp), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm0, 64(%rcx)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm0, (%rcx)
 ; AVX512BW-NEXT:    vmovdqa64 %zmm22, 64(%r8)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm1, (%r8)
-; AVX512BW-NEXT:    vmovdqa64 %zmm26, 64(%r9)
-; AVX512BW-NEXT:    vmovdqa64 %zmm24, (%r9)
+; AVX512BW-NEXT:    vmovdqa64 %zmm20, (%r8)
+; AVX512BW-NEXT:    vmovdqa64 %zmm27, 64(%r9)
+; AVX512BW-NEXT:    vmovdqa64 %zmm25, (%r9)
 ; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-NEXT:    vmovdqa64 %zmm19, 64(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm20, (%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm16, 64(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm17, (%rax)
 ; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, 64(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm4, (%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm4, 64(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, (%rax)
 ; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-NEXT:    vmovdqa64 %zmm5, 64(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm3, (%rax)
-; AVX512BW-NEXT:    addq $280, %rsp # imm = 0x118
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, 64(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm5, (%rax)
+; AVX512BW-NEXT:    addq $200, %rsp
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
   %wide.vec = load <128 x i64>, ptr %in.vec, align 64
@@ -3843,331 +3710,267 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ;
 ; AVX2-ONLY-LABEL: load_i64_stride8_vf32:
 ; AVX2-ONLY:       # %bb.0:
-; AVX2-ONLY-NEXT:    subq $2408, %rsp # imm = 0x968
-; AVX2-ONLY-NEXT:    vmovaps 192(%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps 128(%rdi), %xmm2
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 704(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovaps 640(%rdi), %xmm3
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm4 = xmm3[0],xmm1[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 1216(%rdi), %xmm4
-; AVX2-ONLY-NEXT:    vmovaps 1152(%rdi), %xmm6
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm5 = xmm6[0],xmm4[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 576(%rdi), %xmm5
-; AVX2-ONLY-NEXT:    vmovaps 512(%rdi), %xmm7
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm8 = xmm7[0],xmm5[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 1088(%rdi), %xmm9
-; AVX2-ONLY-NEXT:    vmovaps 1024(%rdi), %xmm10
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm8 = xmm10[0],xmm9[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 1600(%rdi), %xmm8
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm2[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 1536(%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm2 = xmm6[1],xmm4[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm0[0],xmm8[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm2 = xmm10[1],xmm9[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 1728(%rdi), %xmm2
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm1 = xmm3[1],xmm1[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 1664(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm3 = xmm7[1],xmm5[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm3 = xmm1[0],xmm2[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 448(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm8[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 384(%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm0[0],xmm1[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-ONLY-NEXT:    subq $2248, %rsp # imm = 0x8C8
 ; AVX2-ONLY-NEXT:    vmovaps 320(%rdi), %xmm0
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 448(%rdi), %ymm0, %ymm0
 ; AVX2-ONLY-NEXT:    vmovaps 256(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 1472(%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps 1408(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 1344(%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps 1280(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 960(%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps 896(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 832(%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps 768(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 1984(%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps 1920(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 1856(%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps 1792(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps (%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps 64(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm0[0],xmm1[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 64(%rdi), %ymm0
-; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps (%rdi), %ymm1
-; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 192(%rdi), %ymm2
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 384(%rdi), %ymm1, %ymm1
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
 ; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 128(%rdi), %ymm3
-; AVX2-ONLY-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 832(%rdi), %xmm2
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 960(%rdi), %ymm2, %ymm2
+; AVX2-ONLY-NEXT:    vmovaps 768(%rdi), %xmm3
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 896(%rdi), %ymm3, %ymm3
 ; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm4 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm5 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 1344(%rdi), %xmm4
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 1472(%rdi), %ymm4, %ymm4
+; AVX2-ONLY-NEXT:    vmovaps 1280(%rdi), %xmm5
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 1408(%rdi), %ymm5, %ymm5
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm6 = ymm5[0],ymm4[0],ymm5[2],ymm4[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 1856(%rdi), %xmm6
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 1984(%rdi), %ymm6, %ymm6
+; AVX2-ONLY-NEXT:    vmovaps 1792(%rdi), %xmm7
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 1920(%rdi), %ymm7, %ymm7
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm8 = ymm7[0],ymm6[0],ymm7[2],ymm6[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps (%rdi), %xmm8
+; AVX2-ONLY-NEXT:    vmovaps 64(%rdi), %xmm9
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 192(%rdi), %ymm9, %ymm9
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 128(%rdi), %ymm8, %ymm8
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm10 = ymm8[0],ymm9[0],ymm8[2],ymm9[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 576(%rdi), %xmm10
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 704(%rdi), %ymm10, %ymm10
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 512(%rdi), %xmm0
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 640(%rdi), %ymm0, %ymm0
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm1 = ymm3[1],ymm2[1],ymm3[3],ymm2[3]
+; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm0[0],ymm10[0],ymm0[2],ymm10[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm1 = ymm5[1],ymm4[1],ymm5[3],ymm4[3]
+; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 1088(%rdi), %xmm1
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 1216(%rdi), %ymm1, %ymm1
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm2 = ymm7[1],ymm6[1],ymm7[3],ymm6[3]
+; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 1024(%rdi), %xmm2
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 1152(%rdi), %ymm2, %ymm2
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm3 = ymm8[1],ymm9[1],ymm8[3],ymm9[3]
+; AVX2-ONLY-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[2],ymm1[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm10[1],ymm0[3],ymm10[3]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 1600(%rdi), %xmm0
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 1728(%rdi), %ymm0, %ymm0
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3]
+; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 1536(%rdi), %xmm1
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 1664(%rdi), %ymm1, %ymm1
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 320(%rdi), %ymm0
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 256(%rdi), %ymm1
+; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 448(%rdi), %ymm2
+; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 384(%rdi), %ymm3
+; AVX2-ONLY-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm4 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm5 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
 ; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm5[2,3],ymm4[2,3]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 576(%rdi), %ymm0
+; AVX2-ONLY-NEXT:    vmovaps 832(%rdi), %ymm0
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 512(%rdi), %ymm1
+; AVX2-ONLY-NEXT:    vmovaps 768(%rdi), %ymm1
 ; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 704(%rdi), %ymm2
+; AVX2-ONLY-NEXT:    vmovaps 960(%rdi), %ymm2
 ; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 640(%rdi), %ymm3
+; AVX2-ONLY-NEXT:    vmovaps 896(%rdi), %ymm3
 ; AVX2-ONLY-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm8 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
 ; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm9 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
 ; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm9[2,3],ymm8[2,3]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 1088(%rdi), %ymm0
+; AVX2-ONLY-NEXT:    vmovaps 1344(%rdi), %ymm0
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 1024(%rdi), %ymm1
+; AVX2-ONLY-NEXT:    vmovaps 1280(%rdi), %ymm1
 ; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 1216(%rdi), %ymm2
+; AVX2-ONLY-NEXT:    vmovaps 1472(%rdi), %ymm2
 ; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 1152(%rdi), %ymm3
+; AVX2-ONLY-NEXT:    vmovaps 1408(%rdi), %ymm3
 ; AVX2-ONLY-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm12 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
 ; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm13 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
 ; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm12[2,3]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 1600(%rdi), %ymm1
+; AVX2-ONLY-NEXT:    vmovaps 1856(%rdi), %ymm1
 ; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 1536(%rdi), %ymm2
+; AVX2-ONLY-NEXT:    vmovaps 1792(%rdi), %ymm2
 ; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 1728(%rdi), %ymm14
-; AVX2-ONLY-NEXT:    vmovaps 1664(%rdi), %ymm15
+; AVX2-ONLY-NEXT:    vmovaps 1984(%rdi), %ymm14
+; AVX2-ONLY-NEXT:    vmovaps 1920(%rdi), %ymm15
 ; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm15[0],ymm14[0],ymm15[2],ymm14[2]
 ; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2]
 ; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 1856(%rdi), %ymm1
+; AVX2-ONLY-NEXT:    vmovaps 64(%rdi), %ymm1
 ; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 1792(%rdi), %ymm2
+; AVX2-ONLY-NEXT:    vmovaps (%rdi), %ymm2
 ; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 1984(%rdi), %ymm0
-; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 1920(%rdi), %ymm3
-; AVX2-ONLY-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vmovaps 192(%rdi), %ymm8
+; AVX2-ONLY-NEXT:    vmovaps 128(%rdi), %ymm6
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm6[0],ymm8[0],ymm6[2],ymm8[2]
 ; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm5 = ymm2[0],ymm1[0],ymm2[2],ymm1[2]
 ; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm5[2,3],ymm0[2,3]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 1344(%rdi), %ymm12
-; AVX2-ONLY-NEXT:    vmovaps 1280(%rdi), %ymm11
-; AVX2-ONLY-NEXT:    vmovaps 1472(%rdi), %ymm10
-; AVX2-ONLY-NEXT:    vmovaps 1408(%rdi), %ymm8
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm8[0],ymm10[0],ymm8[2],ymm10[2]
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm7 = ymm11[0],ymm12[0],ymm11[2],ymm12[2]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm7[2,3],ymm0[2,3]
+; AVX2-ONLY-NEXT:    vmovaps 576(%rdi), %ymm1
+; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 512(%rdi), %ymm13
+; AVX2-ONLY-NEXT:    vmovaps 704(%rdi), %ymm7
+; AVX2-ONLY-NEXT:    vmovaps 640(%rdi), %ymm5
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm5[0],ymm7[0],ymm5[2],ymm7[2]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm9 = ymm13[0],ymm1[0],ymm13[2],ymm1[2]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm9[2,3],ymm0[2,3]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 832(%rdi), %ymm6
-; AVX2-ONLY-NEXT:    vmovaps 768(%rdi), %ymm7
-; AVX2-ONLY-NEXT:    vmovaps 960(%rdi), %ymm4
-; AVX2-ONLY-NEXT:    vmovaps 896(%rdi), %ymm3
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm3[0],ymm4[0],ymm3[2],ymm4[2]
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm9 = ymm7[0],ymm6[0],ymm7[2],ymm6[2]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm9[2,3],ymm2[2,3]
+; AVX2-ONLY-NEXT:    vmovaps 1088(%rdi), %ymm1
+; AVX2-ONLY-NEXT:    vmovups %ymm1, (%rsp) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 1024(%rdi), %ymm9
+; AVX2-ONLY-NEXT:    vmovaps 1216(%rdi), %ymm4
+; AVX2-ONLY-NEXT:    vmovaps 1152(%rdi), %ymm3
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm4[0],ymm3[2],ymm4[2]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm12 = ymm9[0],ymm1[0],ymm9[2],ymm1[2]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm12[2,3],ymm0[2,3]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 320(%rdi), %ymm2
-; AVX2-ONLY-NEXT:    vmovaps 256(%rdi), %ymm9
-; AVX2-ONLY-NEXT:    vmovaps 448(%rdi), %ymm1
-; AVX2-ONLY-NEXT:    vmovaps 384(%rdi), %ymm0
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm5 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm13 = ymm9[0],ymm2[0],ymm9[2],ymm2[2]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm5 = ymm13[2,3],ymm5[2,3]
-; AVX2-ONLY-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT:    # ymm5 = ymm5[1],mem[1],ymm5[3],mem[3]
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT:    # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm5 = ymm13[2,3],ymm5[2,3]
-; AVX2-ONLY-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT:    # ymm5 = ymm5[1],mem[1],ymm5[3],mem[3]
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT:    # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm5 = ymm13[2,3],ymm5[2,3]
-; AVX2-ONLY-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT:    # ymm5 = ymm5[1],mem[1],ymm5[3],mem[3]
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT:    # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm5 = ymm13[2,3],ymm5[2,3]
-; AVX2-ONLY-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm5 = ymm15[1],ymm14[1],ymm15[3],ymm14[3]
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT:    # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm5 = ymm13[2,3],ymm5[2,3]
-; AVX2-ONLY-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm1 = ymm9[1],ymm2[1],ymm9[3],ymm2[3]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
+; AVX2-ONLY-NEXT:    vmovaps 1600(%rdi), %ymm10
+; AVX2-ONLY-NEXT:    vmovaps 1536(%rdi), %ymm12
+; AVX2-ONLY-NEXT:    vmovaps 1728(%rdi), %ymm2
+; AVX2-ONLY-NEXT:    vmovaps 1664(%rdi), %ymm1
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm2[0],ymm1[2],ymm2[2]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm11 = ymm12[0],ymm10[0],ymm12[2],ymm10[2]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm11[2,3],ymm0[2,3]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm3[1],ymm4[1],ymm3[3],ymm4[3]
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm1 = ymm7[1],ymm6[1],ymm7[3],ymm6[3]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm11 = ymm11[1],mem[1],ymm11[3],mem[3]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm11[2,3],ymm0[2,3]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm8[1],ymm10[1],ymm8[3],ymm10[3]
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm1 = ymm11[1],ymm12[1],ymm11[3],ymm12[3]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm11 = ymm11[1],mem[1],ymm11[3],mem[3]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm11[2,3],ymm0[2,3]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
 ; AVX2-ONLY-NEXT:    # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT:    # ymm1 = ymm1[1],mem[1],ymm1[3],mem[3]
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm11 = ymm11[1],mem[1],ymm11[3],mem[3]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm11[2,3],ymm0[2,3]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm15[1],ymm14[1],ymm15[3],ymm14[3]
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm11 = ymm11[1],mem[1],ymm11[3],mem[3]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm11[2,3],ymm0[2,3]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm6[1],ymm8[1],ymm6[3],ymm8[3]
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm6 = ymm6[1],mem[1],ymm6[3],mem[3]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm6[2,3],ymm0[2,3]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm5[1],ymm7[1],ymm5[3],ymm7[3]
+; AVX2-ONLY-NEXT:    vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm5 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm5 = ymm13[1],mem[1],ymm13[3],mem[3]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm5[2,3],ymm0[2,3]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm3[1],ymm4[1],ymm3[3],ymm4[3]
+; AVX2-ONLY-NEXT:    vunpckhpd (%rsp), %ymm9, %ymm3 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm3 = ymm9[1],mem[1],ymm9[3],mem[3]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm2[1],ymm1[3],ymm2[3]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm1 = ymm12[1],ymm10[1],ymm12[3],ymm10[3]
 ; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 480(%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps 416(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 96(%rdi), %xmm0
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 224(%rdi), %ymm0, %ymm0
+; AVX2-ONLY-NEXT:    vmovaps 32(%rdi), %xmm1
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 160(%rdi), %ymm1, %ymm1
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovaps 352(%rdi), %xmm0
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 480(%rdi), %ymm0, %ymm0
 ; AVX2-ONLY-NEXT:    vmovaps 288(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 992(%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps 928(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 416(%rdi), %ymm1, %ymm1
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 608(%rdi), %xmm0
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 736(%rdi), %ymm0, %ymm0
+; AVX2-ONLY-NEXT:    vmovaps 544(%rdi), %xmm1
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 672(%rdi), %ymm1, %ymm1
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovaps 864(%rdi), %xmm0
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 992(%rdi), %ymm0, %ymm0
 ; AVX2-ONLY-NEXT:    vmovaps 800(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 928(%rdi), %ymm1, %ymm1
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm2, (%rsp) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovaps 1120(%rdi), %xmm0
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 1248(%rdi), %ymm0, %ymm0
 ; AVX2-ONLY-NEXT:    vmovaps 1056(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 1248(%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps 1184(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 1504(%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps 1440(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 1184(%rdi), %ymm1, %ymm1
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovaps 1376(%rdi), %xmm0
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 1504(%rdi), %ymm0, %ymm0
 ; AVX2-ONLY-NEXT:    vmovaps 1312(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 736(%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps 672(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 608(%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps 544(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 224(%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps 160(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 32(%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps 96(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm0[0],xmm1[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 1440(%rdi), %ymm1, %ymm1
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovaps 1632(%rdi), %xmm0
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 1760(%rdi), %ymm0, %ymm0
 ; AVX2-ONLY-NEXT:    vmovaps 1568(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 1760(%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps 1696(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, (%rsp) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 1696(%rdi), %ymm1, %ymm1
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovaps 1888(%rdi), %xmm0
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 2016(%rdi), %ymm0, %ymm0
 ; AVX2-ONLY-NEXT:    vmovaps 1824(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 2016(%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps 1952(%rdi), %xmm2
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm2[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 1952(%rdi), %ymm1, %ymm1
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovaps 96(%rdi), %ymm0
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovaps 32(%rdi), %ymm1
@@ -4176,9 +3979,9 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovaps 160(%rdi), %ymm3
 ; AVX2-ONLY-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm4 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm7 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm7[2,3],ymm4[2,3]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm5 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm5[2,3],ymm2[2,3]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovaps 352(%rdi), %ymm0
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -4188,9 +3991,9 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovaps 416(%rdi), %ymm3
 ; AVX2-ONLY-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm4 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm7 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm7[2,3],ymm4[2,3]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm5 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm5[2,3],ymm2[2,3]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovaps 608(%rdi), %ymm0
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -4200,9 +4003,9 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovaps 672(%rdi), %ymm3
 ; AVX2-ONLY-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm4 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm7 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm7[2,3],ymm4[2,3]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm5 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm13 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm5[2,3]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovaps 864(%rdi), %ymm0
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -4212,9 +4015,9 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovaps 928(%rdi), %ymm3
 ; AVX2-ONLY-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm4 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm7 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm7[2,3],ymm4[2,3]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm5 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm13 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm5[2,3]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovaps 1120(%rdi), %ymm0
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -4222,152 +4025,112 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovaps 1248(%rdi), %ymm15
 ; AVX2-ONLY-NEXT:    vmovaps 1184(%rdi), %ymm14
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm4 = ymm14[0],ymm15[0],ymm14[2],ymm15[2]
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm7 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm7[2,3],ymm4[2,3]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm5 = ymm14[0],ymm15[0],ymm14[2],ymm15[2]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm13 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm5[2,3]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 1376(%rdi), %ymm13
-; AVX2-ONLY-NEXT:    vmovaps 1312(%rdi), %ymm12
-; AVX2-ONLY-NEXT:    vmovaps 1504(%rdi), %ymm11
-; AVX2-ONLY-NEXT:    vmovaps 1440(%rdi), %ymm10
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm4 = ymm10[0],ymm11[0],ymm10[2],ymm11[2]
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm7 = ymm12[0],ymm13[0],ymm12[2],ymm13[2]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm7[2,3],ymm4[2,3]
+; AVX2-ONLY-NEXT:    vmovaps 1376(%rdi), %ymm12
+; AVX2-ONLY-NEXT:    vmovaps 1312(%rdi), %ymm11
+; AVX2-ONLY-NEXT:    vmovaps 1504(%rdi), %ymm10
+; AVX2-ONLY-NEXT:    vmovaps 1440(%rdi), %ymm9
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm5 = ymm9[0],ymm10[0],ymm9[2],ymm10[2]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm13 = ymm11[0],ymm12[0],ymm11[2],ymm12[2]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm5[2,3]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 1632(%rdi), %ymm9
-; AVX2-ONLY-NEXT:    vmovaps 1568(%rdi), %ymm8
+; AVX2-ONLY-NEXT:    vmovaps 1632(%rdi), %ymm8
+; AVX2-ONLY-NEXT:    vmovaps 1568(%rdi), %ymm7
 ; AVX2-ONLY-NEXT:    vmovaps 1760(%rdi), %ymm6
-; AVX2-ONLY-NEXT:    vmovaps 1696(%rdi), %ymm5
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm4 = ymm5[0],ymm6[0],ymm5[2],ymm6[2]
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm7 = ymm8[0],ymm9[0],ymm8[2],ymm9[2]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm7[2,3],ymm4[2,3]
+; AVX2-ONLY-NEXT:    vmovaps 1696(%rdi), %ymm4
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm5 = ymm4[0],ymm6[0],ymm4[2],ymm6[2]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm13 = ymm7[0],ymm8[0],ymm7[2],ymm8[2]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm5[2,3]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovaps 1888(%rdi), %ymm3
 ; AVX2-ONLY-NEXT:    vmovaps 1824(%rdi), %ymm2
 ; AVX2-ONLY-NEXT:    vmovaps 2016(%rdi), %ymm1
 ; AVX2-ONLY-NEXT:    vmovaps 1952(%rdi), %ymm0
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm7 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm7[2,3],ymm4[2,3]
-; AVX2-ONLY-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm7 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT:    # ymm7 = ymm4[1],mem[1],ymm4[3],mem[3]
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT:    # ymm4 = ymm4[1],mem[1],ymm4[3],mem[3]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],ymm7[2,3]
-; AVX2-ONLY-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT:    # ymm4 = ymm4[1],mem[1],ymm4[3],mem[3]
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT:    # ymm7 = ymm7[1],mem[1],ymm7[3],mem[3]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm7[2,3],ymm4[2,3]
-; AVX2-ONLY-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT:    # ymm4 = ymm4[1],mem[1],ymm4[3],mem[3]
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT:    # ymm7 = ymm7[1],mem[1],ymm7[3],mem[3]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm7[2,3],ymm4[2,3]
-; AVX2-ONLY-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT:    # ymm4 = ymm4[1],mem[1],ymm4[3],mem[3]
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT:    # ymm7 = ymm7[1],mem[1],ymm7[3],mem[3]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm7[2,3],ymm4[2,3]
-; AVX2-ONLY-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm4 = ymm14[1],ymm15[1],ymm14[3],ymm15[3]
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT:    # ymm7 = ymm7[1],mem[1],ymm7[3],mem[3]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm14 = ymm7[2,3],ymm4[2,3]
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm4 = ymm10[1],ymm11[1],ymm10[3],ymm11[3]
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm7 = ymm12[1],ymm13[1],ymm12[3],ymm13[3]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm10 = ymm7[2,3],ymm4[2,3]
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm4 = ymm5[1],ymm6[1],ymm5[3],ymm6[3]
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm5 = ymm8[1],ymm9[1],ymm8[3],ymm9[3]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm5 = ymm5[2,3],ymm4[2,3]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm5 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm13 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm5 = ymm13[2,3],ymm5[2,3]
+; AVX2-ONLY-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm13 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm13 = ymm5[1],mem[1],ymm5[3],mem[3]
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm5 = ymm5[1],mem[1],ymm5[3],mem[3]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm5 = ymm5[2,3],ymm13[2,3]
+; AVX2-ONLY-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm5 = ymm5[1],mem[1],ymm5[3],mem[3]
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm5 = ymm13[2,3],ymm5[2,3]
+; AVX2-ONLY-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm5 = ymm5[1],mem[1],ymm5[3],mem[3]
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm5 = ymm13[2,3],ymm5[2,3]
+; AVX2-ONLY-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm5 = ymm5[1],mem[1],ymm5[3],mem[3]
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm5 = ymm13[2,3],ymm5[2,3]
+; AVX2-ONLY-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm5 = ymm14[1],ymm15[1],ymm14[3],ymm15[3]
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm14 = ymm13[2,3],ymm5[2,3]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm5 = ymm9[1],ymm10[1],ymm9[3],ymm10[3]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm9 = ymm11[1],ymm12[1],ymm11[3],ymm12[3]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm9 = ymm9[2,3],ymm5[2,3]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm6[1],ymm4[3],ymm6[3]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm5 = ymm7[1],ymm8[1],ymm7[3],ymm8[3]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm5[2,3],ymm4[2,3]
 ; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
 ; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
 ; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 240(%rsi)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 224(%rsi)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 32(%rsi)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 160(%rsi)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 96(%rsi)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 48(%rsi)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 176(%rsi)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 112(%rsi)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 208(%rsi)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 192(%rsi)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 128(%rsi)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 64(%rsi)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, (%rsi)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 144(%rsi)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 80(%rsi)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 16(%rsi)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 224(%rdx)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 240(%rdx)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 96(%rdx)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 112(%rdx)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 160(%rdx)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 176(%rdx)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 32(%rdx)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 48(%rdx)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 192(%rdx)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 208(%rdx)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 64(%rdx)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 80(%rdx)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 128(%rdx)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 144(%rdx)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, (%rdx)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 16(%rdx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm1, 32(%rcx)
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, 192(%rsi)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm1, 96(%rcx)
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, 128(%rsi)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm1, 160(%rcx)
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, 64(%rsi)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm1, 224(%rcx)
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, (%rsi)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, 224(%rsi)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, 160(%rsi)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, 96(%rsi)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, 32(%rsi)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, 192(%rdx)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, 128(%rdx)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, 64(%rdx)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, (%rdx)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, 224(%rdx)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, 160(%rdx)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, 96(%rdx)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, 32(%rdx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm1, 192(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
@@ -4377,13 +4140,13 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm1, (%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm1, 224(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, 224(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm1, 160(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, 160(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm1, 96(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, 96(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm1, 32(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, 32(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm1, 192(%r8)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
@@ -4392,71 +4155,30 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-ONLY-NEXT:    vmovaps %ymm1, 64(%r8)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm1, (%r8)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 240(%r9)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 224(%r9)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 48(%r9)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 160(%r9)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 176(%r9)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 96(%r9)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 112(%r9)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 32(%r9)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 16(%r9)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, (%r9)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 192(%r9)
-; AVX2-ONLY-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 208(%r9)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 128(%r9)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 144(%r9)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 64(%r9)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 80(%r9)
-; AVX2-ONLY-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 240(%rax)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 224(%rax)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 208(%rax)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 192(%rax)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, (%rax)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 16(%rax)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 64(%rax)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 80(%rax)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 160(%rax)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 176(%rax)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 144(%rax)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 128(%rax)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 96(%rax)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 112(%rax)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 32(%rax)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 48(%rax)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, 224(%r8)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, 160(%r8)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, 96(%r8)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, 32(%r8)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, 224(%r9)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, 192(%r9)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, 160(%r9)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, 128(%r9)
+; AVX2-ONLY-NEXT:    vmovups (%rsp), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, 96(%r9)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, 64(%r9)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, 32(%r9)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, (%r9)
 ; AVX2-ONLY-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm1, 224(%rax)
@@ -4475,1091 +4197,1064 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm1, (%rax)
 ; AVX2-ONLY-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 224(%rax)
-; AVX2-ONLY-NEXT:    vmovaps %ymm5, 192(%rax)
-; AVX2-ONLY-NEXT:    vmovaps %ymm10, 160(%rax)
-; AVX2-ONLY-NEXT:    vmovaps %ymm14, 128(%rax)
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 96(%rax)
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, 224(%rax)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, 192(%rax)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, 160(%rax)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, 128(%rax)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, 96(%rax)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, 64(%rax)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, 32(%rax)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, (%rax)
+; AVX2-ONLY-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 224(%rax)
+; AVX2-ONLY-NEXT:    vmovaps %ymm4, 192(%rax)
+; AVX2-ONLY-NEXT:    vmovaps %ymm9, 160(%rax)
+; AVX2-ONLY-NEXT:    vmovaps %ymm14, 128(%rax)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 96(%rax)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, 64(%rax)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, 32(%rax)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, (%rax)
-; AVX2-ONLY-NEXT:    addq $2408, %rsp # imm = 0x968
+; AVX2-ONLY-NEXT:    addq $2248, %rsp # imm = 0x8C8
 ; AVX2-ONLY-NEXT:    vzeroupper
 ; AVX2-ONLY-NEXT:    retq
 ;
 ; AVX512F-LABEL: load_i64_stride8_vf32:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    subq $2568, %rsp # imm = 0xA08
-; AVX512F-NEXT:    vmovdqa64 1856(%rdi), %zmm21
-; AVX512F-NEXT:    vmovdqa64 1984(%rdi), %zmm24
-; AVX512F-NEXT:    vmovdqa64 1344(%rdi), %zmm19
-; AVX512F-NEXT:    vmovdqa64 1280(%rdi), %zmm31
-; AVX512F-NEXT:    vmovdqa64 1472(%rdi), %zmm26
-; AVX512F-NEXT:    vmovdqa64 1408(%rdi), %zmm27
-; AVX512F-NEXT:    vmovdqa64 832(%rdi), %zmm22
+; AVX512F-NEXT:    subq $2440, %rsp # imm = 0x988
+; AVX512F-NEXT:    vmovdqa64 1856(%rdi), %zmm10
+; AVX512F-NEXT:    vmovdqa64 1984(%rdi), %zmm27
+; AVX512F-NEXT:    vmovdqa64 1344(%rdi), %zmm3
+; AVX512F-NEXT:    vmovdqa64 1280(%rdi), %zmm25
+; AVX512F-NEXT:    vmovdqa64 1472(%rdi), %zmm23
+; AVX512F-NEXT:    vmovdqa64 1408(%rdi), %zmm28
+; AVX512F-NEXT:    vmovdqa64 832(%rdi), %zmm8
+; AVX512F-NEXT:    vmovdqu64 %zmm8, (%rsp) # 64-byte Spill
 ; AVX512F-NEXT:    vmovdqa64 768(%rdi), %zmm9
 ; AVX512F-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 960(%rdi), %zmm30
-; AVX512F-NEXT:    vmovdqa64 896(%rdi), %zmm10
-; AVX512F-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 320(%rdi), %zmm12
-; AVX512F-NEXT:    vmovdqa64 256(%rdi), %zmm6
-; AVX512F-NEXT:    vmovdqa64 448(%rdi), %zmm20
-; AVX512F-NEXT:    vmovdqa64 384(%rdi), %zmm29
+; AVX512F-NEXT:    vmovdqa64 960(%rdi), %zmm11
+; AVX512F-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 896(%rdi), %zmm14
+; AVX512F-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 320(%rdi), %zmm17
+; AVX512F-NEXT:    vmovdqa64 256(%rdi), %zmm31
+; AVX512F-NEXT:    vmovdqa64 448(%rdi), %zmm21
+; AVX512F-NEXT:    vmovdqa64 384(%rdi), %zmm13
+; AVX512F-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-NEXT:    movb $-64, %al
 ; AVX512F-NEXT:    kmovw %eax, %k1
 ; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,2,10,2,10,2,10]
 ; AVX512F-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm27, %zmm1
-; AVX512F-NEXT:    vpermt2q %zmm26, %zmm0, %zmm1
-; AVX512F-NEXT:    vmovdqa64 %zmm31, %zmm2
-; AVX512F-NEXT:    vpermt2q %zmm19, %zmm0, %zmm2
+; AVX512F-NEXT:    vmovdqa64 %zmm28, %zmm1
+; AVX512F-NEXT:    vpermt2q %zmm23, %zmm0, %zmm1
+; AVX512F-NEXT:    vmovdqa64 %zmm25, %zmm2
+; AVX512F-NEXT:    vpermt2q %zmm3, %zmm0, %zmm2
 ; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
-; AVX512F-NEXT:    vmovdqa64 1216(%rdi), %ymm28
-; AVX512F-NEXT:    vmovdqa 1152(%rdi), %ymm1
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm4 = ymm1[0],ymm28[0],ymm1[2],ymm28[2]
-; AVX512F-NEXT:    vmovdqa64 1088(%rdi), %ymm25
-; AVX512F-NEXT:    vmovdqa 1024(%rdi), %ymm5
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm7 = ymm5[0],ymm25[0],ymm5[2],ymm25[2]
+; AVX512F-NEXT:    vmovdqa64 1216(%rdi), %ymm16
+; AVX512F-NEXT:    vmovdqa 1152(%rdi), %ymm12
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm4 = ymm12[0],ymm16[0],ymm12[2],ymm16[2]
+; AVX512F-NEXT:    vmovdqa 1088(%rdi), %ymm5
+; AVX512F-NEXT:    vmovdqa 1024(%rdi), %ymm6
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm5[0],ymm6[2],ymm5[2]
 ; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm4 = ymm7[2,3],ymm4[2,3]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm4, %zmm2, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm10, %zmm2
-; AVX512F-NEXT:    vpermt2q %zmm30, %zmm0, %zmm2
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm4, %zmm2, %zmm1
+; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm14, %zmm2
+; AVX512F-NEXT:    vpermt2q %zmm11, %zmm0, %zmm2
 ; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm4
-; AVX512F-NEXT:    vpermt2q %zmm22, %zmm0, %zmm4
+; AVX512F-NEXT:    vpermt2q %zmm8, %zmm0, %zmm4
 ; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm4 {%k1}
-; AVX512F-NEXT:    vmovdqa64 704(%rdi), %ymm23
-; AVX512F-NEXT:    vmovdqa64 640(%rdi), %ymm17
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm8 = ymm17[0],ymm23[0],ymm17[2],ymm23[2]
-; AVX512F-NEXT:    vmovdqa 576(%rdi), %ymm2
-; AVX512F-NEXT:    vmovdqa 512(%rdi), %ymm10
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm11 = ymm10[0],ymm2[0],ymm10[2],ymm2[2]
-; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm8 = ymm11[2,3],ymm8[2,3]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm8, %zmm4, %zmm4
-; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm29, %zmm4
-; AVX512F-NEXT:    vpermt2q %zmm20, %zmm0, %zmm4
-; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm8
-; AVX512F-NEXT:    vmovdqa64 %zmm12, %zmm7
-; AVX512F-NEXT:    vpermt2q %zmm12, %zmm0, %zmm8
+; AVX512F-NEXT:    vmovdqa 704(%rdi), %ymm2
+; AVX512F-NEXT:    vmovdqa 640(%rdi), %ymm7
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm2[0],ymm7[2],ymm2[2]
+; AVX512F-NEXT:    vmovdqa 576(%rdi), %ymm9
+; AVX512F-NEXT:    vmovdqa 512(%rdi), %ymm11
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm14 = ymm11[0],ymm9[0],ymm11[2],ymm9[2]
+; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm8 = ymm14[2,3],ymm8[2,3]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm8, %zmm4, %zmm1
+; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm4
+; AVX512F-NEXT:    vpermt2q %zmm21, %zmm0, %zmm4
+; AVX512F-NEXT:    vmovdqa64 %zmm31, %zmm8
+; AVX512F-NEXT:    vpermt2q %zmm17, %zmm0, %zmm8
 ; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm8 {%k1}
-; AVX512F-NEXT:    vmovdqa 192(%rdi), %ymm9
-; AVX512F-NEXT:    vmovdqa 128(%rdi), %ymm11
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm12 = ymm11[0],ymm9[0],ymm11[2],ymm9[2]
-; AVX512F-NEXT:    vmovdqa 64(%rdi), %ymm4
-; AVX512F-NEXT:    vmovdqa64 (%rdi), %ymm16
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm15 = ymm16[0],ymm4[0],ymm16[2],ymm4[2]
-; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm12 = ymm15[2,3],ymm12[2,3]
-; AVX512F-NEXT:    vmovdqa64 1920(%rdi), %zmm13
-; AVX512F-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm12, %zmm8, %zmm8
-; AVX512F-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm8
-; AVX512F-NEXT:    vpermt2q %zmm24, %zmm0, %zmm8
-; AVX512F-NEXT:    vmovdqa64 1792(%rdi), %zmm12
-; AVX512F-NEXT:    vmovdqu64 %zmm12, (%rsp) # 64-byte Spill
-; AVX512F-NEXT:    vpermi2q %zmm21, %zmm12, %zmm0
-; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm0 {%k1}
-; AVX512F-NEXT:    vmovdqa 1728(%rdi), %ymm14
-; AVX512F-NEXT:    vmovdqa 1664(%rdi), %ymm12
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm15 = ymm12[0],ymm14[0],ymm12[2],ymm14[2]
-; AVX512F-NEXT:    vmovdqa64 1600(%rdi), %ymm18
-; AVX512F-NEXT:    vmovdqa 1536(%rdi), %ymm8
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm13 = ymm8[0],ymm18[0],ymm8[2],ymm18[2]
-; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm13 = ymm13[2,3],ymm15[2,3]
+; AVX512F-NEXT:    vmovdqa 192(%rdi), %ymm4
+; AVX512F-NEXT:    vmovdqa 128(%rdi), %ymm14
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm14[0],ymm4[0],ymm14[2],ymm4[2]
+; AVX512F-NEXT:    vmovdqa64 64(%rdi), %ymm18
+; AVX512F-NEXT:    vmovdqa64 (%rdi), %ymm19
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm13 = ymm19[0],ymm18[0],ymm19[2],ymm18[2]
+; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm13[2,3],ymm1[2,3]
+; AVX512F-NEXT:    vmovdqa64 1920(%rdi), %zmm29
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm8, %zmm1
+; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm29, %zmm1
+; AVX512F-NEXT:    vpermt2q %zmm27, %zmm0, %zmm1
+; AVX512F-NEXT:    vmovdqa64 1792(%rdi), %zmm8
+; AVX512F-NEXT:    vpermi2q %zmm10, %zmm8, %zmm0
+; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm30
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1}
+; AVX512F-NEXT:    vmovdqa 1728(%rdi), %ymm1
+; AVX512F-NEXT:    vmovdqa 1664(%rdi), %ymm8
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm13 = ymm8[0],ymm1[0],ymm8[2],ymm1[2]
+; AVX512F-NEXT:    vmovdqa64 1600(%rdi), %ymm20
+; AVX512F-NEXT:    vmovdqa64 1536(%rdi), %ymm26
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm15 = ymm26[0],ymm20[0],ymm26[2],ymm20[2]
+; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3]
 ; AVX512F-NEXT:    vinserti64x4 $0, %ymm13, %zmm0, %zmm0
 ; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11]
 ; AVX512F-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm27, %zmm13
-; AVX512F-NEXT:    vpermt2q %zmm26, %zmm0, %zmm13
-; AVX512F-NEXT:    vmovdqa64 %zmm31, %zmm15
-; AVX512F-NEXT:    vpermt2q %zmm19, %zmm0, %zmm15
+; AVX512F-NEXT:    vmovdqa64 %zmm28, %zmm13
+; AVX512F-NEXT:    vpermt2q %zmm23, %zmm0, %zmm13
+; AVX512F-NEXT:    vmovdqa64 %zmm23, %zmm22
+; AVX512F-NEXT:    vmovdqa64 %zmm25, %zmm15
+; AVX512F-NEXT:    vpermt2q %zmm3, %zmm0, %zmm15
+; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm24
 ; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm15 {%k1}
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],ymm28[1],ymm1[3],ymm28[3]
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm5[1],ymm25[1],ymm5[3],ymm25[3]
-; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm15, %zmm1
-; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm1
-; AVX512F-NEXT:    vpermt2q %zmm30, %zmm0, %zmm1
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm15, %zmm3
-; AVX512F-NEXT:    vmovdqa64 %zmm22, %zmm13
-; AVX512F-NEXT:    vpermt2q %zmm22, %zmm0, %zmm3
-; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm3 {%k1}
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm17[1],ymm23[1],ymm17[3],ymm23[3]
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm10[1],ymm2[1],ymm10[3],ymm2[3]
-; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm3, %zmm1
-; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm29, %zmm1
-; AVX512F-NEXT:    vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm20, %zmm0, %zmm1
-; AVX512F-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm7, %zmm0, %zmm2
-; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm11[1],ymm9[1],ymm11[3],ymm9[3]
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm12[1],ymm16[1],ymm12[3],ymm16[3]
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm5 = ymm6[1],ymm5[1],ymm6[3],ymm5[3]
+; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm5[2,3],ymm3[2,3]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm3, %zmm15, %zmm3
+; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm23, %zmm3
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm6, %zmm0, %zmm3
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm5
+; AVX512F-NEXT:    vmovdqu64 (%rsp), %zmm16 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm16, %zmm0, %zmm5
+; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm5 {%k1}
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm7[1],ymm2[1],ymm7[3],ymm2[3]
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm11[1],ymm9[1],ymm11[3],ymm9[3]
+; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm3[2,3],ymm2[2,3]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm5, %zmm2
+; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm12, %zmm2
+; AVX512F-NEXT:    vpermt2q %zmm21, %zmm0, %zmm2
+; AVX512F-NEXT:    vmovdqa64 %zmm31, %zmm3
+; AVX512F-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm17, %zmm0, %zmm3
+; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm14[1],ymm4[1],ymm14[3],ymm4[3]
 ; AVX512F-NEXT:    vmovdqa64 1216(%rdi), %zmm9
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm16[1],ymm4[1],ymm16[3],ymm4[3]
-; AVX512F-NEXT:    vmovdqa64 1152(%rdi), %zmm10
-; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3]
-; AVX512F-NEXT:    vmovdqa64 1088(%rdi), %zmm11
-; AVX512F-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm2, %zmm1
-; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm22, %zmm1
-; AVX512F-NEXT:    vpermt2q %zmm24, %zmm0, %zmm1
-; AVX512F-NEXT:    vmovdqu64 (%rsp), %zmm23 # 64-byte Reload
-; AVX512F-NEXT:    vpermi2q %zmm21, %zmm23, %zmm0
-; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1}
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm4 = ymm19[1],ymm18[1],ymm19[3],ymm18[3]
+; AVX512F-NEXT:    vmovdqa64 1152(%rdi), %zmm11
+; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm4[2,3],ymm2[2,3]
+; AVX512F-NEXT:    vmovdqa64 1088(%rdi), %zmm14
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm29, %zmm2
+; AVX512F-NEXT:    vpermt2q %zmm27, %zmm0, %zmm2
+; AVX512F-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermi2q %zmm10, %zmm30, %zmm0
+; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm0 {%k1}
 ; AVX512F-NEXT:    vmovdqa64 1024(%rdi), %zmm4
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm12[1],ymm14[1],ymm12[3],ymm14[3]
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm8[1],ymm18[1],ymm8[3],ymm18[3]
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm1[1],ymm8[3],ymm1[3]
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm26[1],ymm20[1],ymm26[3],ymm20[3]
 ; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
 ; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm0
 ; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12]
 ; AVX512F-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm27, %zmm1
-; AVX512F-NEXT:    vpermt2q %zmm26, %zmm0, %zmm1
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm2 = zmm31[0],zmm19[0],zmm31[2],zmm19[2],zmm31[4],zmm19[4],zmm31[6],zmm19[6]
+; AVX512F-NEXT:    vmovdqa64 %zmm28, %zmm1
+; AVX512F-NEXT:    vpermt2q %zmm22, %zmm0, %zmm1
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm2 = zmm25[0],zmm24[0],zmm25[2],zmm24[2],zmm25[4],zmm24[4],zmm25[6],zmm24[6]
 ; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
 ; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm3
-; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm12
+; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm8
 ; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm11, %zmm0, %zmm3
+; AVX512F-NEXT:    vpermt2q %zmm14, %zmm0, %zmm3
+; AVX512F-NEXT:    vmovdqa64 %zmm14, %zmm15
+; AVX512F-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [4,12,4,12]
 ; AVX512F-NEXT:    # ymm1 = mem[0,1,0,1]
-; AVX512F-NEXT:    vmovdqa64 %zmm10, %zmm4
-; AVX512F-NEXT:    vmovdqa64 %zmm10, %zmm14
-; AVX512F-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm11, %zmm4
+; AVX512F-NEXT:    vmovdqa64 %zmm11, %zmm14
+; AVX512F-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-NEXT:    vpermt2q %zmm9, %zmm1, %zmm4
-; AVX512F-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm26
 ; AVX512F-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7]
 ; AVX512F-NEXT:    vinserti64x4 $0, %ymm3, %zmm2, %zmm2
 ; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm2
-; AVX512F-NEXT:    vpermt2q %zmm30, %zmm0, %zmm2
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm15[0],zmm13[0],zmm15[2],zmm13[2],zmm15[4],zmm13[4],zmm15[6],zmm13[6]
-; AVX512F-NEXT:    vmovdqa64 %zmm15, %zmm11
+; AVX512F-NEXT:    vmovdqa64 %zmm23, %zmm2
+; AVX512F-NEXT:    vpermt2q %zmm6, %zmm0, %zmm2
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm13[0],zmm16[0],zmm13[2],zmm16[2],zmm13[4],zmm16[4],zmm13[6],zmm16[6]
+; AVX512F-NEXT:    vmovdqa64 %zmm16, %zmm19
 ; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm16
 ; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
-; AVX512F-NEXT:    vmovdqa64 576(%rdi), %zmm4
-; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 512(%rdi), %zmm13
-; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm2
-; AVX512F-NEXT:    vpermt2q %zmm4, %zmm0, %zmm2
-; AVX512F-NEXT:    vmovdqa64 704(%rdi), %zmm5
+; AVX512F-NEXT:    vmovdqa64 576(%rdi), %zmm18
+; AVX512F-NEXT:    vmovdqa64 512(%rdi), %zmm2
+; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm18, %zmm0, %zmm2
+; AVX512F-NEXT:    vmovdqa64 704(%rdi), %zmm6
 ; AVX512F-NEXT:    vmovdqa64 640(%rdi), %zmm4
 ; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm5, %zmm1, %zmm4
-; AVX512F-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm6, %zmm1, %zmm4
+; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm11
+; AVX512F-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
 ; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
 ; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm20, %zmm0, %zmm29
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm6[0],zmm7[0],zmm6[2],zmm7[2],zmm6[4],zmm7[4],zmm6[6],zmm7[6]
-; AVX512F-NEXT:    vmovdqa64 %zmm29, %zmm3 {%k1}
+; AVX512F-NEXT:    vmovdqa64 %zmm12, %zmm2
+; AVX512F-NEXT:    vpermt2q %zmm21, %zmm0, %zmm2
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm31[0],zmm17[0],zmm31[2],zmm17[2],zmm31[4],zmm17[4],zmm31[6],zmm17[6]
+; AVX512F-NEXT:    vmovdqa64 %zmm31, %zmm6
+; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
 ; AVX512F-NEXT:    vmovdqa64 64(%rdi), %zmm4
 ; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-NEXT:    vmovdqa64 (%rdi), %zmm2
 ; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-NEXT:    vpermt2q %zmm4, %zmm0, %zmm2
-; AVX512F-NEXT:    vmovdqa64 192(%rdi), %zmm6
-; AVX512F-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 192(%rdi), %zmm5
+; AVX512F-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-NEXT:    vmovdqa64 128(%rdi), %zmm4
 ; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm6, %zmm1, %zmm4
+; AVX512F-NEXT:    vpermt2q %zmm5, %zmm1, %zmm4
 ; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
 ; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
 ; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm22, %zmm2
-; AVX512F-NEXT:    vpermt2q %zmm24, %zmm0, %zmm2
-; AVX512F-NEXT:    vmovdqa64 %zmm24, %zmm4
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm23[0],zmm21[0],zmm23[2],zmm21[2],zmm23[4],zmm21[4],zmm23[6],zmm21[6]
-; AVX512F-NEXT:    vmovdqa64 %zmm21, %zmm6
+; AVX512F-NEXT:    vmovdqa64 %zmm29, %zmm2
+; AVX512F-NEXT:    vmovdqa64 %zmm29, %zmm7
+; AVX512F-NEXT:    vpermt2q %zmm27, %zmm0, %zmm2
+; AVX512F-NEXT:    vmovdqa64 %zmm27, %zmm5
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm30[0],zmm10[0],zmm30[2],zmm10[2],zmm30[4],zmm10[4],zmm30[6],zmm10[6]
 ; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
-; AVX512F-NEXT:    vmovdqa64 1600(%rdi), %zmm7
-; AVX512F-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 1600(%rdi), %zmm9
+; AVX512F-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-NEXT:    vmovdqa64 1536(%rdi), %zmm2
 ; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermi2q %zmm7, %zmm2, %zmm0
-; AVX512F-NEXT:    vmovdqa64 1728(%rdi), %zmm7
-; AVX512F-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermi2q %zmm9, %zmm2, %zmm0
+; AVX512F-NEXT:    vmovdqa64 1728(%rdi), %zmm9
+; AVX512F-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-NEXT:    vmovdqa64 1664(%rdi), %zmm2
 ; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermi2q %zmm7, %zmm2, %zmm1
+; AVX512F-NEXT:    vpermi2q %zmm9, %zmm2, %zmm1
 ; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
 ; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm3, %zmm0
 ; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm18 = [0,8,0,8,0,8,0,8]
-; AVX512F-NEXT:    # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm31, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm19, %zmm18, %zmm0
+; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm30 = [0,8,0,8,0,8,0,8]
+; AVX512F-NEXT:    # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vmovdqa64 %zmm25, %zmm0
+; AVX512F-NEXT:    vpermt2q %zmm24, %zmm30, %zmm0
 ; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm23 = [1,9,1,9,1,9,1,9]
-; AVX512F-NEXT:    # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm31, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm19, %zmm23, %zmm0
+; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm31 = [1,9,1,9,1,9,1,9]
+; AVX512F-NEXT:    # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vmovdqa64 %zmm25, %zmm0
+; AVX512F-NEXT:    vpermt2q %zmm24, %zmm31, %zmm0
 ; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [6,14,6,14,6,14,6,14]
+; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [6,14,6,14,6,14,6,14]
+; AVX512F-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vmovdqa64 %zmm25, %zmm3
+; AVX512F-NEXT:    vpermt2q %zmm24, %zmm2, %zmm3
+; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm12
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm2 = zmm25[1],zmm24[1],zmm25[3],zmm24[3],zmm25[5],zmm24[5],zmm25[7],zmm24[7]
+; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm9 = [7,15,7,15,7,15,7,15]
+; AVX512F-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vpermt2q %zmm24, %zmm9, %zmm25
+; AVX512F-NEXT:    vmovdqa64 %zmm22, %zmm0
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm28[0],zmm22[0],zmm28[2],zmm22[2],zmm28[4],zmm22[4],zmm28[6],zmm22[6]
+; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm25 {%k1} = zmm28[1],zmm22[1],zmm28[3],zmm22[3],zmm28[5],zmm22[5],zmm28[7],zmm22[7]
+; AVX512F-NEXT:    vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm28, %zmm3
+; AVX512F-NEXT:    vmovdqa64 %zmm28, %zmm1
+; AVX512F-NEXT:    vmovdqa64 %zmm28, %zmm22
+; AVX512F-NEXT:    vpermt2q %zmm0, %zmm30, %zmm22
+; AVX512F-NEXT:    vpermt2q %zmm0, %zmm31, %zmm3
+; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [5,13,5,13,5,13,5,13]
 ; AVX512F-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm31, %zmm2
-; AVX512F-NEXT:    vpermt2q %zmm19, %zmm3, %zmm2
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm1
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm8 = zmm31[1],zmm19[1],zmm31[3],zmm19[3],zmm31[5],zmm19[5],zmm31[7],zmm19[7]
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm10 = [7,15,7,15,7,15,7,15]
-; AVX512F-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vpermt2q %zmm19, %zmm10, %zmm31
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm1 {%k1} = zmm27[0],zmm26[0],zmm27[2],zmm26[2],zmm27[4],zmm26[4],zmm27[6],zmm26[6]
+; AVX512F-NEXT:    vpermt2q %zmm0, %zmm3, %zmm1
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
+; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm13
+; AVX512F-NEXT:    vpermt2q %zmm15, %zmm3, %zmm13
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm8 = [5,13,5,13]
+; AVX512F-NEXT:    # ymm8 = mem[0,1,0,1]
+; AVX512F-NEXT:    vpermt2q %zmm26, %zmm8, %zmm14
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm13, %zmm2, %zmm0
+; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm16, %zmm1
+; AVX512F-NEXT:    vmovdqa64 %zmm19, %zmm0
+; AVX512F-NEXT:    vpermt2q %zmm19, %zmm30, %zmm16
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm25
+; AVX512F-NEXT:    vpermt2q %zmm19, %zmm31, %zmm25
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm15
+; AVX512F-NEXT:    vmovdqa64 %zmm12, %zmm2
+; AVX512F-NEXT:    vpermt2q %zmm19, %zmm12, %zmm15
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm19 = zmm1[1],zmm19[1],zmm1[3],zmm19[3],zmm1[5],zmm19[5],zmm1[7],zmm19[7]
+; AVX512F-NEXT:    vpermt2q %zmm0, %zmm9, %zmm1
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm15 {%k1} = zmm23[0],zmm17[0],zmm23[2],zmm17[2],zmm23[4],zmm17[4],zmm23[6],zmm17[6]
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm23[1],zmm17[1],zmm23[3],zmm17[3],zmm23[5],zmm17[5],zmm23[7],zmm17[7]
 ; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm31 {%k1} = zmm27[1],zmm26[1],zmm27[3],zmm26[3],zmm27[5],zmm26[5],zmm27[7],zmm26[7]
-; AVX512F-NEXT:    vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm27, %zmm7
-; AVX512F-NEXT:    vmovdqa64 %zmm27, %zmm1
-; AVX512F-NEXT:    vpermt2q %zmm26, %zmm18, %zmm27
-; AVX512F-NEXT:    vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm26, %zmm23, %zmm7
-; AVX512F-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [5,13,5,13,5,13,5,13]
-; AVX512F-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vpermt2q %zmm26, %zmm2, %zmm1
-; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm8 {%k1}
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm19, %zmm2, %zmm12
-; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm7 = [5,13,5,13]
-; AVX512F-NEXT:    # ymm7 = mem[0,1,0,1]
-; AVX512F-NEXT:    vmovdqa64 %zmm14, %zmm15
-; AVX512F-NEXT:    vpermt2q %zmm9, %zmm7, %zmm15
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm15[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm8, %zmm0
+; AVX512F-NEXT:    vmovdqa64 %zmm23, %zmm0
+; AVX512F-NEXT:    vmovdqa64 %zmm23, %zmm27
+; AVX512F-NEXT:    vmovdqa64 %zmm23, %zmm28
+; AVX512F-NEXT:    vpermt2q %zmm17, %zmm30, %zmm28
+; AVX512F-NEXT:    vpermt2q %zmm17, %zmm31, %zmm0
 ; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm11, %zmm27
-; AVX512F-NEXT:    vpermt2q %zmm16, %zmm18, %zmm27
-; AVX512F-NEXT:    vmovdqa64 %zmm11, %zmm22
-; AVX512F-NEXT:    vpermt2q %zmm16, %zmm23, %zmm22
-; AVX512F-NEXT:    vmovdqa64 %zmm11, %zmm21
-; AVX512F-NEXT:    vpermt2q %zmm16, %zmm3, %zmm21
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm14 = zmm11[1],zmm16[1],zmm11[3],zmm16[3],zmm11[5],zmm16[5],zmm11[7],zmm16[7]
-; AVX512F-NEXT:    vpermt2q %zmm16, %zmm10, %zmm11
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm21 {%k1} = zmm16[0],zmm30[0],zmm16[2],zmm30[2],zmm16[4],zmm30[4],zmm16[6],zmm30[6]
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm11 {%k1} = zmm16[1],zmm30[1],zmm16[3],zmm30[3],zmm16[5],zmm30[5],zmm16[7],zmm30[7]
-; AVX512F-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm16, %zmm29
-; AVX512F-NEXT:    vmovdqa64 %zmm16, %zmm11
-; AVX512F-NEXT:    vpermt2q %zmm30, %zmm18, %zmm16
-; AVX512F-NEXT:    vpermt2q %zmm30, %zmm23, %zmm29
-; AVX512F-NEXT:    vpermt2q %zmm30, %zmm2, %zmm11
-; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm8
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm0, %zmm2, %zmm8
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm5, %zmm7, %zmm15
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm15[4,5,6,7]
-; AVX512F-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm31
+; AVX512F-NEXT:    vpermt2q %zmm17, %zmm3, %zmm27
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm13
+; AVX512F-NEXT:    vpermt2q %zmm18, %zmm3, %zmm13
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm11, %zmm8, %zmm14
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm14[4,5,6,7]
+; AVX512F-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm12
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm18, %zmm31
-; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm20
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm23, %zmm20
-; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm26
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm3, %zmm26
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm9 = zmm5[1],zmm1[1],zmm5[3],zmm1[3],zmm5[5],zmm1[5],zmm5[7],zmm1[7]
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm10, %zmm5
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm26 {%k1} = zmm12[0],zmm28[0],zmm12[2],zmm28[2],zmm12[4],zmm28[4],zmm12[6],zmm28[6]
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm12[1],zmm28[1],zmm12[3],zmm28[3],zmm12[5],zmm28[5],zmm12[7],zmm28[7]
-; AVX512F-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm12, %zmm24
-; AVX512F-NEXT:    vmovdqa64 %zmm12, %zmm5
-; AVX512F-NEXT:    vpermt2q %zmm28, %zmm18, %zmm12
-; AVX512F-NEXT:    vpermt2q %zmm28, %zmm23, %zmm24
-; AVX512F-NEXT:    vpermt2q %zmm28, %zmm2, %zmm5
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm30, %zmm25
-; AVX512F-NEXT:    vpermt2q %zmm4, %zmm18, %zmm25
-; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm1
-; AVX512F-NEXT:    vmovdqu64 (%rsp), %zmm6 # 64-byte Reload
-; AVX512F-NEXT:    vpermi2q %zmm1, %zmm6, %zmm18
-; AVX512F-NEXT:    vmovdqa64 %zmm30, %zmm28
-; AVX512F-NEXT:    vpermt2q %zmm4, %zmm23, %zmm28
-; AVX512F-NEXT:    vpermi2q %zmm1, %zmm6, %zmm23
-; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm17
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm3, %zmm17
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm8 = zmm6[1],zmm1[1],zmm6[3],zmm1[3],zmm6[5],zmm1[5],zmm6[7],zmm1[7]
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm10, %zmm6
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm17 {%k1} = zmm30[0],zmm4[0],zmm30[2],zmm4[2],zmm30[4],zmm4[4],zmm30[6],zmm4[6]
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm6 {%k1} = zmm30[1],zmm4[1],zmm30[3],zmm4[3],zmm30[5],zmm4[5],zmm30[7],zmm4[7]
-; AVX512F-NEXT:    vmovdqu64 %zmm6, (%rsp) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm4, %zmm2, %zmm30
-; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm6
-; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm15
+; AVX512F-NEXT:    vpermt2q %zmm1, %zmm30, %zmm12
+; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm23
+; AVX512F-NEXT:    vpermt2q %zmm1, %zmm31, %zmm23
+; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm29
+; AVX512F-NEXT:    vpermt2q %zmm1, %zmm2, %zmm29
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm14 = zmm6[1],zmm1[1],zmm6[3],zmm1[3],zmm6[5],zmm1[5],zmm6[7],zmm1[7]
+; AVX512F-NEXT:    vpermt2q %zmm1, %zmm9, %zmm6
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm21, %zmm0
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm29 {%k1} = zmm20[0],zmm21[0],zmm20[2],zmm21[2],zmm20[4],zmm21[4],zmm20[6],zmm21[6]
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm6 {%k1} = zmm20[1],zmm21[1],zmm20[3],zmm21[3],zmm20[5],zmm21[5],zmm20[7],zmm21[7]
+; AVX512F-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm20, %zmm21
+; AVX512F-NEXT:    vmovdqa64 %zmm20, %zmm6
+; AVX512F-NEXT:    vpermt2q %zmm0, %zmm30, %zmm20
+; AVX512F-NEXT:    vmovdqa64 %zmm31, %zmm10
+; AVX512F-NEXT:    vpermt2q %zmm0, %zmm31, %zmm21
 ; AVX512F-NEXT:    vpermt2q %zmm0, %zmm3, %zmm6
-; AVX512F-NEXT:    vpermt2q %zmm0, %zmm10, %zmm13
-; AVX512F-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm3
+; AVX512F-NEXT:    vmovdqa64 %zmm7, %zmm24
+; AVX512F-NEXT:    vpermt2q %zmm5, %zmm30, %zmm24
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm0, %zmm2, %zmm3
+; AVX512F-NEXT:    vpermi2q %zmm1, %zmm0, %zmm30
+; AVX512F-NEXT:    vmovdqa64 %zmm7, %zmm31
+; AVX512F-NEXT:    vpermt2q %zmm5, %zmm10, %zmm31
+; AVX512F-NEXT:    vpermi2q %zmm1, %zmm0, %zmm10
+; AVX512F-NEXT:    vmovdqu64 %zmm10, (%rsp) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm17
+; AVX512F-NEXT:    vpermt2q %zmm1, %zmm2, %zmm17
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm10 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
+; AVX512F-NEXT:    vpermt2q %zmm1, %zmm9, %zmm0
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm17 {%k1} = zmm7[0],zmm5[0],zmm7[2],zmm5[2],zmm7[4],zmm5[4],zmm7[6],zmm5[6]
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm7[1],zmm5[1],zmm7[3],zmm5[3],zmm7[5],zmm5[5],zmm7[7],zmm5[7]
+; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm5, %zmm3, %zmm7
+; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm13
+; AVX512F-NEXT:    vpermt2q %zmm18, %zmm2, %zmm13
+; AVX512F-NEXT:    vpermt2q %zmm18, %zmm9, %zmm4
+; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
 ; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm1
-; AVX512F-NEXT:    vpermt2q %zmm0, %zmm15, %zmm1
-; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm0, %zmm10, %zmm4
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm0, %zmm3, %zmm1
+; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm5
+; AVX512F-NEXT:    vpermt2q %zmm0, %zmm2, %zmm5
+; AVX512F-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm0, %zmm9, %zmm4
 ; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm4
-; AVX512F-NEXT:    vpermt2q %zmm19, %zmm15, %zmm4
-; AVX512F-NEXT:    vpermi2q %zmm1, %zmm0, %zmm15
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm10, %zmm0
-; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm19, %zmm10, %zmm13
-; AVX512F-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm11, %zmm14 {%k1}
-; AVX512F-NEXT:    vinserti64x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm0 # 32-byte Folded Reload
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512F-NEXT:    vpermi2q %zmm18, %zmm0, %zmm3
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm11
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm4, %zmm2, %zmm11
+; AVX512F-NEXT:    vpermi2q %zmm18, %zmm0, %zmm2
+; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm18, %zmm9, %zmm0
 ; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm9 {%k1}
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm10, %zmm5
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm13, %zmm7, %zmm5
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm5[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm9, %zmm0
+; AVX512F-NEXT:    vpermt2q %zmm4, %zmm9, %zmm5
+; AVX512F-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm27, %zmm19 {%k1}
+; AVX512F-NEXT:    vinserti64x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm19, %zmm0 # 32-byte Folded Reload
 ; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm30, %zmm8 {%k1}
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm14 {%k1}
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512F-NEXT:    vpermi2q %zmm9, %zmm5, %zmm7
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm7[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm8, %zmm0
+; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm6
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm19, %zmm8, %zmm6
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm14, %zmm0
 ; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [6,14,6,14]
-; AVX512F-NEXT:    # ymm0 = mem[0,1,0,1]
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm3
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm0, %zmm3
-; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm7 = [7,15,7,15]
-; AVX512F-NEXT:    # ymm7 = mem[0,1,0,1]
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm7, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm10, %zmm1
-; AVX512F-NEXT:    vmovdqa64 %zmm10, %zmm11
-; AVX512F-NEXT:    vpermt2q %zmm13, %zmm0, %zmm11
-; AVX512F-NEXT:    vpermt2q %zmm13, %zmm7, %zmm1
-; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm10
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm0, %zmm10
-; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm2
-; AVX512F-NEXT:    vpermi2q %zmm9, %zmm5, %zmm0
-; AVX512F-NEXT:    vpermt2q %zmm9, %zmm7, %zmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm7, %zmm8
-; AVX512F-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7]
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm4, %zmm1, %zmm1
-; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm3, %zmm21, %zmm1
-; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm12, %zmm31 {%k1}
-; AVX512F-NEXT:    vmovdqa 192(%rdi), %xmm14
-; AVX512F-NEXT:    vmovdqa 128(%rdi), %xmm13
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm8 = xmm13[0],xmm14[0]
-; AVX512F-NEXT:    vmovdqa (%rdi), %xmm10
-; AVX512F-NEXT:    vmovdqa 64(%rdi), %xmm7
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm21 = xmm10[0],xmm7[0]
-; AVX512F-NEXT:    vinserti32x4 $1, %xmm8, %ymm21, %ymm8
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm8, %zmm31, %zmm1
-; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqa64 %zmm16, %zmm27 {%k1}
-; AVX512F-NEXT:    vmovdqa 704(%rdi), %xmm12
-; AVX512F-NEXT:    vmovdqa64 640(%rdi), %xmm21
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm30 = xmm21[0],xmm12[0]
-; AVX512F-NEXT:    vmovdqa64 576(%rdi), %xmm31
-; AVX512F-NEXT:    vmovdqa 512(%rdi), %xmm5
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm9 = xmm5[0],xmm31[0]
-; AVX512F-NEXT:    vinserti32x4 $1, %xmm30, %ymm9, %ymm9
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm9, %zmm27, %zmm1
-; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm7, %zmm10 {%k1}
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512F-NEXT:    vpermi2q %zmm18, %zmm14, %zmm8
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm8[4,5,6,7]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm10, %zmm27
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [6,14,6,14]
+; AVX512F-NEXT:    # ymm2 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm3 {%k1}
-; AVX512F-NEXT:    vmovdqa64 1216(%rdi), %xmm19
-; AVX512F-NEXT:    vmovdqa64 1152(%rdi), %xmm27
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm30 = xmm27[0],xmm19[0]
-; AVX512F-NEXT:    vmovdqa 1088(%rdi), %xmm2
-; AVX512F-NEXT:    vmovdqa 1024(%rdi), %xmm1
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm1[0],xmm2[0]
-; AVX512F-NEXT:    vinserti32x4 $1, %xmm30, %ymm4, %ymm4
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm4, %zmm3, %zmm8
-; AVX512F-NEXT:    vmovdqa64 %zmm25, %zmm18 {%k1}
-; AVX512F-NEXT:    vmovdqa64 1728(%rdi), %xmm25
-; AVX512F-NEXT:    vmovdqa64 1664(%rdi), %xmm30
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm6 = xmm30[0],xmm25[0]
-; AVX512F-NEXT:    vmovdqa 1600(%rdi), %xmm4
-; AVX512F-NEXT:    vmovdqa 1536(%rdi), %xmm3
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm9 = xmm3[0],xmm4[0]
-; AVX512F-NEXT:    vinserti128 $1, %xmm6, %ymm9, %ymm6
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm6, %zmm18, %zmm6
-; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm3
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm0, %zmm2, %zmm3
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [7,15,7,15]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
+; AVX512F-NEXT:    vpermt2q %zmm0, %zmm4, %zmm1
+; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm6
+; AVX512F-NEXT:    vpermt2q %zmm19, %zmm2, %zmm6
+; AVX512F-NEXT:    vpermt2q %zmm19, %zmm4, %zmm9
+; AVX512F-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm16 {%k1}
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} xmm9 = xmm27[1],xmm19[1]
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm2[1]
-; AVX512F-NEXT:    vinserti128 $1, %xmm9, %ymm1, %ymm1
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm16, %zmm1
-; AVX512F-NEXT:    vmovdqa64 %zmm29, %zmm22 {%k1}
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm21[1],xmm12[1]
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm31[1]
-; AVX512F-NEXT:    vinserti128 $1, %xmm2, %ymm5, %ymm2
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm22, %zmm2
-; AVX512F-NEXT:    vmovdqa64 %zmm24, %zmm20 {%k1}
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} xmm5 = xmm13[1],xmm14[1]
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} xmm7 = xmm10[1],xmm7[1]
-; AVX512F-NEXT:    vinserti128 $1, %xmm5, %ymm7, %ymm5
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm5, %zmm20, %zmm5
-; AVX512F-NEXT:    vmovdqa64 %zmm28, %zmm23 {%k1}
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} xmm7 = xmm30[1],xmm25[1]
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm4[1]
-; AVX512F-NEXT:    vinserti128 $1, %xmm7, %ymm3, %ymm3
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm3, %zmm23, %zmm3
-; AVX512F-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm4 # 32-byte Folded Reload
-; AVX512F-NEXT:    # ymm4 = mem[0,1,2,3],ymm11[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm4, %zmm26, %zmm4
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm17, %zmm0
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512F-NEXT:    vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
-; AVX512F-NEXT:    # ymm7 = mem[0,1,2,3],ymm7[4,5,6,7]
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512F-NEXT:    vinsertf64x4 $0, %ymm7, %zmm9, %zmm7
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512F-NEXT:    vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload
-; AVX512F-NEXT:    # ymm9 = mem[0,1,2,3],ymm9[4,5,6,7]
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512F-NEXT:    vinsertf64x4 $0, %ymm9, %zmm10, %zmm9
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512F-NEXT:    vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload
-; AVX512F-NEXT:    # ymm10 = mem[0,1,2,3],ymm10[4,5,6,7]
-; AVX512F-NEXT:    vmovups (%rsp), %zmm11 # 64-byte Reload
-; AVX512F-NEXT:    vinsertf64x4 $0, %ymm10, %zmm11, %zmm10
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512F-NEXT:    vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
-; AVX512F-NEXT:    # ymm11 = mem[0,1,2,3],ymm11[4,5,6,7]
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512F-NEXT:    vinsertf64x4 $0, %ymm11, %zmm12, %zmm11
-; AVX512F-NEXT:    vmovdqa64 %zmm6, 192(%rsi)
-; AVX512F-NEXT:    vmovdqa64 %zmm8, 128(%rsi)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm6, 64(%rsi)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm6, (%rsi)
-; AVX512F-NEXT:    vmovdqa64 %zmm3, 192(%rdx)
-; AVX512F-NEXT:    vmovdqa64 %zmm5, (%rdx)
-; AVX512F-NEXT:    vmovdqa64 %zmm2, 64(%rdx)
-; AVX512F-NEXT:    vmovdqa64 %zmm1, 128(%rdx)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm1, 192(%rcx)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm1, (%rcx)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm1, 64(%rcx)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm1, 128(%rcx)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm1, 192(%r8)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm1, (%r8)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm1, 64(%r8)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm1, 128(%r8)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm1, 192(%r9)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm1, (%r9)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm1, 64(%r9)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm1, 128(%r9)
-; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm1, 192(%rax)
-; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm1, (%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm8
+; AVX512F-NEXT:    vpermt2q %zmm26, %zmm2, %zmm8
+; AVX512F-NEXT:    vpermi2q %zmm18, %zmm14, %zmm2
+; AVX512F-NEXT:    vpermt2q %zmm18, %zmm4, %zmm14
+; AVX512F-NEXT:    vmovdqa64 %zmm14, %zmm0
+; AVX512F-NEXT:    vpermt2q %zmm26, %zmm4, %zmm9
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm4 = ymm11[0,1,2,3],ymm8[4,5,6,7]
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm4, %zmm5, %zmm4
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3],ymm3[4,5,6,7]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm3, %zmm15, %zmm3
+; AVX512F-NEXT:    vmovdqa64 %zmm20, %zmm12 {%k1}
+; AVX512F-NEXT:    vmovdqa 64(%rdi), %xmm5
+; AVX512F-NEXT:    vinserti128 $1, 192(%rdi), %ymm5, %ymm5
+; AVX512F-NEXT:    vmovdqa (%rdi), %xmm7
+; AVX512F-NEXT:    vinserti128 $1, 128(%rdi), %ymm7, %ymm7
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm5[0],ymm7[2],ymm5[2]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm8, %zmm12, %zmm8
+; AVX512F-NEXT:    vmovdqa64 %zmm28, %zmm16 {%k1}
+; AVX512F-NEXT:    vmovdqa 576(%rdi), %xmm10
+; AVX512F-NEXT:    vinserti128 $1, 704(%rdi), %ymm10, %ymm10
+; AVX512F-NEXT:    vmovdqa 512(%rdi), %xmm11
+; AVX512F-NEXT:    vinserti128 $1, 640(%rdi), %ymm11, %ymm11
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm12 = ymm11[0],ymm10[0],ymm11[2],ymm10[2]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm12, %zmm16, %zmm12
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm22, %zmm14 {%k1}
+; AVX512F-NEXT:    vmovdqa 1088(%rdi), %xmm13
+; AVX512F-NEXT:    vinserti128 $1, 1216(%rdi), %ymm13, %ymm13
+; AVX512F-NEXT:    vmovdqa 1024(%rdi), %xmm15
+; AVX512F-NEXT:    vinserti128 $1, 1152(%rdi), %ymm15, %ymm15
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm20 = ymm15[0],ymm13[0],ymm15[2],ymm13[2]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm20, %zmm14, %zmm20
+; AVX512F-NEXT:    vmovdqa64 %zmm24, %zmm30 {%k1}
+; AVX512F-NEXT:    vmovdqa64 1600(%rdi), %xmm22
+; AVX512F-NEXT:    vinserti32x4 $1, 1728(%rdi), %ymm22, %ymm22
+; AVX512F-NEXT:    vmovdqa64 1536(%rdi), %xmm24
+; AVX512F-NEXT:    vinserti32x4 $1, 1664(%rdi), %ymm24, %ymm24
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm26 = ymm24[0],ymm22[0],ymm24[2],ymm22[2]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm26, %zmm30, %zmm16
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm18, %zmm14 {%k1}
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm13 = ymm15[1],ymm13[1],ymm15[3],ymm13[3]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm13, %zmm14, %zmm13
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm14, %zmm25 {%k1}
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm10 = ymm11[1],ymm10[1],ymm11[3],ymm10[3]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm10, %zmm25, %zmm10
+; AVX512F-NEXT:    vmovdqa64 %zmm21, %zmm23 {%k1}
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm5 = ymm7[1],ymm5[1],ymm7[3],ymm5[3]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm5, %zmm23, %zmm5
+; AVX512F-NEXT:    vmovdqu64 (%rsp), %zmm11 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm31, %zmm11 {%k1}
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm7 = ymm24[1],ymm22[1],ymm24[3],ymm22[3]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm7, %zmm11, %zmm7
+; AVX512F-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
+; AVX512F-NEXT:    # ymm6 = mem[0,1,2,3],ymm6[4,5,6,7]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm6, %zmm29, %zmm6
+; AVX512F-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
+; AVX512F-NEXT:    # ymm2 = mem[0,1,2,3],ymm2[4,5,6,7]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm17, %zmm2
+; AVX512F-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm11 # 32-byte Folded Reload
+; AVX512F-NEXT:    # ymm11 = mem[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm11, %zmm1, %zmm11
 ; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm1, 64(%rax)
+; AVX512F-NEXT:    vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload
+; AVX512F-NEXT:    # ymm14 = mem[0,1,2,3],ymm1[4,5,6,7]
 ; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT:    vmovaps %zmm1, 128(%rax)
+; AVX512F-NEXT:    vinsertf64x4 $0, %ymm14, %zmm1, %zmm14
+; AVX512F-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload
+; AVX512F-NEXT:    # ymm15 = mem[0,1,2,3],ymm0[4,5,6,7]
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm15, %zmm0, %zmm15
+; AVX512F-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload
+; AVX512F-NEXT:    # ymm9 = mem[0,1,2,3],ymm9[4,5,6,7]
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm9, %zmm0, %zmm9
+; AVX512F-NEXT:    vmovdqa64 %zmm16, 192(%rsi)
+; AVX512F-NEXT:    vmovdqa64 %zmm20, 128(%rsi)
+; AVX512F-NEXT:    vmovdqa64 %zmm12, 64(%rsi)
+; AVX512F-NEXT:    vmovdqa64 %zmm8, (%rsi)
+; AVX512F-NEXT:    vmovdqa64 %zmm7, 192(%rdx)
+; AVX512F-NEXT:    vmovdqa64 %zmm5, (%rdx)
+; AVX512F-NEXT:    vmovdqa64 %zmm10, 64(%rdx)
+; AVX512F-NEXT:    vmovdqa64 %zmm13, 128(%rdx)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm5, 192(%rcx)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm5, (%rcx)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm5, 64(%rcx)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm5, 128(%rcx)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm5, 192(%r8)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm5, (%r8)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm5, 64(%r8)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm5, 128(%r8)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm0, 192(%r9)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm0, (%r9)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm5, 64(%r9)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm5, 128(%r9)
 ; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512F-NEXT:    vmovdqa64 %zmm0, 192(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm4, (%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm27, 192(%rax)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm0, (%rax)
 ; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512F-NEXT:    vmovaps %zmm0, 64(%rax)
 ; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512F-NEXT:    vmovaps %zmm0, 128(%rax)
 ; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512F-NEXT:    vmovaps %zmm11, 128(%rax)
-; AVX512F-NEXT:    vmovaps %zmm10, 192(%rax)
-; AVX512F-NEXT:    vmovaps %zmm9, (%rax)
-; AVX512F-NEXT:    vmovaps %zmm7, 64(%rax)
-; AVX512F-NEXT:    addq $2568, %rsp # imm = 0xA08
+; AVX512F-NEXT:    vmovdqa64 %zmm2, 192(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm6, (%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm3, 64(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm4, 128(%rax)
+; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512F-NEXT:    vmovdqa64 %zmm9, 128(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm15, 192(%rax)
+; AVX512F-NEXT:    vmovaps %zmm14, (%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm11, 64(%rax)
+; AVX512F-NEXT:    addq $2440, %rsp # imm = 0x988
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512BW-LABEL: load_i64_stride8_vf32:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    subq $2568, %rsp # imm = 0xA08
-; AVX512BW-NEXT:    vmovdqa64 1856(%rdi), %zmm21
-; AVX512BW-NEXT:    vmovdqa64 1984(%rdi), %zmm24
-; AVX512BW-NEXT:    vmovdqa64 1344(%rdi), %zmm19
-; AVX512BW-NEXT:    vmovdqa64 1280(%rdi), %zmm31
-; AVX512BW-NEXT:    vmovdqa64 1472(%rdi), %zmm26
-; AVX512BW-NEXT:    vmovdqa64 1408(%rdi), %zmm27
-; AVX512BW-NEXT:    vmovdqa64 832(%rdi), %zmm22
+; AVX512BW-NEXT:    subq $2440, %rsp # imm = 0x988
+; AVX512BW-NEXT:    vmovdqa64 1856(%rdi), %zmm10
+; AVX512BW-NEXT:    vmovdqa64 1984(%rdi), %zmm27
+; AVX512BW-NEXT:    vmovdqa64 1344(%rdi), %zmm3
+; AVX512BW-NEXT:    vmovdqa64 1280(%rdi), %zmm25
+; AVX512BW-NEXT:    vmovdqa64 1472(%rdi), %zmm23
+; AVX512BW-NEXT:    vmovdqa64 1408(%rdi), %zmm28
+; AVX512BW-NEXT:    vmovdqa64 832(%rdi), %zmm8
+; AVX512BW-NEXT:    vmovdqu64 %zmm8, (%rsp) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqa64 768(%rdi), %zmm9
 ; AVX512BW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 960(%rdi), %zmm30
-; AVX512BW-NEXT:    vmovdqa64 896(%rdi), %zmm10
-; AVX512BW-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 320(%rdi), %zmm12
-; AVX512BW-NEXT:    vmovdqa64 256(%rdi), %zmm6
-; AVX512BW-NEXT:    vmovdqa64 448(%rdi), %zmm20
-; AVX512BW-NEXT:    vmovdqa64 384(%rdi), %zmm29
+; AVX512BW-NEXT:    vmovdqa64 960(%rdi), %zmm11
+; AVX512BW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 896(%rdi), %zmm14
+; AVX512BW-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 320(%rdi), %zmm17
+; AVX512BW-NEXT:    vmovdqa64 256(%rdi), %zmm31
+; AVX512BW-NEXT:    vmovdqa64 448(%rdi), %zmm21
+; AVX512BW-NEXT:    vmovdqa64 384(%rdi), %zmm13
+; AVX512BW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    movb $-64, %al
 ; AVX512BW-NEXT:    kmovd %eax, %k1
 ; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,2,10,2,10,2,10]
 ; AVX512BW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm27, %zmm1
-; AVX512BW-NEXT:    vpermt2q %zmm26, %zmm0, %zmm1
-; AVX512BW-NEXT:    vmovdqa64 %zmm31, %zmm2
-; AVX512BW-NEXT:    vpermt2q %zmm19, %zmm0, %zmm2
+; AVX512BW-NEXT:    vmovdqa64 %zmm28, %zmm1
+; AVX512BW-NEXT:    vpermt2q %zmm23, %zmm0, %zmm1
+; AVX512BW-NEXT:    vmovdqa64 %zmm25, %zmm2
+; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm0, %zmm2
 ; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 1216(%rdi), %ymm28
-; AVX512BW-NEXT:    vmovdqa 1152(%rdi), %ymm1
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm4 = ymm1[0],ymm28[0],ymm1[2],ymm28[2]
-; AVX512BW-NEXT:    vmovdqa64 1088(%rdi), %ymm25
-; AVX512BW-NEXT:    vmovdqa 1024(%rdi), %ymm5
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm7 = ymm5[0],ymm25[0],ymm5[2],ymm25[2]
+; AVX512BW-NEXT:    vmovdqa64 1216(%rdi), %ymm16
+; AVX512BW-NEXT:    vmovdqa 1152(%rdi), %ymm12
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm4 = ymm12[0],ymm16[0],ymm12[2],ymm16[2]
+; AVX512BW-NEXT:    vmovdqa 1088(%rdi), %ymm5
+; AVX512BW-NEXT:    vmovdqa 1024(%rdi), %ymm6
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm5[0],ymm6[2],ymm5[2]
 ; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm4 = ymm7[2,3],ymm4[2,3]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm4, %zmm2, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm2
-; AVX512BW-NEXT:    vpermt2q %zmm30, %zmm0, %zmm2
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm4, %zmm2, %zmm1
+; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm14, %zmm2
+; AVX512BW-NEXT:    vpermt2q %zmm11, %zmm0, %zmm2
 ; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm4
-; AVX512BW-NEXT:    vpermt2q %zmm22, %zmm0, %zmm4
+; AVX512BW-NEXT:    vpermt2q %zmm8, %zmm0, %zmm4
 ; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm4 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 704(%rdi), %ymm23
-; AVX512BW-NEXT:    vmovdqa64 640(%rdi), %ymm17
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm8 = ymm17[0],ymm23[0],ymm17[2],ymm23[2]
-; AVX512BW-NEXT:    vmovdqa 576(%rdi), %ymm2
-; AVX512BW-NEXT:    vmovdqa 512(%rdi), %ymm10
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm11 = ymm10[0],ymm2[0],ymm10[2],ymm2[2]
-; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm8 = ymm11[2,3],ymm8[2,3]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm8, %zmm4, %zmm4
-; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm29, %zmm4
-; AVX512BW-NEXT:    vpermt2q %zmm20, %zmm0, %zmm4
-; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm8
-; AVX512BW-NEXT:    vmovdqa64 %zmm12, %zmm7
-; AVX512BW-NEXT:    vpermt2q %zmm12, %zmm0, %zmm8
+; AVX512BW-NEXT:    vmovdqa 704(%rdi), %ymm2
+; AVX512BW-NEXT:    vmovdqa 640(%rdi), %ymm7
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm2[0],ymm7[2],ymm2[2]
+; AVX512BW-NEXT:    vmovdqa 576(%rdi), %ymm9
+; AVX512BW-NEXT:    vmovdqa 512(%rdi), %ymm11
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm14 = ymm11[0],ymm9[0],ymm11[2],ymm9[2]
+; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm8 = ymm14[2,3],ymm8[2,3]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm8, %zmm4, %zmm1
+; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm4
+; AVX512BW-NEXT:    vpermt2q %zmm21, %zmm0, %zmm4
+; AVX512BW-NEXT:    vmovdqa64 %zmm31, %zmm8
+; AVX512BW-NEXT:    vpermt2q %zmm17, %zmm0, %zmm8
 ; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm8 {%k1}
-; AVX512BW-NEXT:    vmovdqa 192(%rdi), %ymm9
-; AVX512BW-NEXT:    vmovdqa 128(%rdi), %ymm11
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm12 = ymm11[0],ymm9[0],ymm11[2],ymm9[2]
-; AVX512BW-NEXT:    vmovdqa 64(%rdi), %ymm4
-; AVX512BW-NEXT:    vmovdqa64 (%rdi), %ymm16
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm15 = ymm16[0],ymm4[0],ymm16[2],ymm4[2]
-; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm12 = ymm15[2,3],ymm12[2,3]
-; AVX512BW-NEXT:    vmovdqa64 1920(%rdi), %zmm13
-; AVX512BW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm12, %zmm8, %zmm8
-; AVX512BW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm8
-; AVX512BW-NEXT:    vpermt2q %zmm24, %zmm0, %zmm8
-; AVX512BW-NEXT:    vmovdqa64 1792(%rdi), %zmm12
-; AVX512BW-NEXT:    vmovdqu64 %zmm12, (%rsp) # 64-byte Spill
-; AVX512BW-NEXT:    vpermi2q %zmm21, %zmm12, %zmm0
-; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm0 {%k1}
-; AVX512BW-NEXT:    vmovdqa 1728(%rdi), %ymm14
-; AVX512BW-NEXT:    vmovdqa 1664(%rdi), %ymm12
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm15 = ymm12[0],ymm14[0],ymm12[2],ymm14[2]
-; AVX512BW-NEXT:    vmovdqa64 1600(%rdi), %ymm18
-; AVX512BW-NEXT:    vmovdqa 1536(%rdi), %ymm8
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm13 = ymm8[0],ymm18[0],ymm8[2],ymm18[2]
-; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm13 = ymm13[2,3],ymm15[2,3]
+; AVX512BW-NEXT:    vmovdqa 192(%rdi), %ymm4
+; AVX512BW-NEXT:    vmovdqa 128(%rdi), %ymm14
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm14[0],ymm4[0],ymm14[2],ymm4[2]
+; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %ymm18
+; AVX512BW-NEXT:    vmovdqa64 (%rdi), %ymm19
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm13 = ymm19[0],ymm18[0],ymm19[2],ymm18[2]
+; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm13[2,3],ymm1[2,3]
+; AVX512BW-NEXT:    vmovdqa64 1920(%rdi), %zmm29
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm8, %zmm1
+; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm29, %zmm1
+; AVX512BW-NEXT:    vpermt2q %zmm27, %zmm0, %zmm1
+; AVX512BW-NEXT:    vmovdqa64 1792(%rdi), %zmm8
+; AVX512BW-NEXT:    vpermi2q %zmm10, %zmm8, %zmm0
+; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm30
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1}
+; AVX512BW-NEXT:    vmovdqa 1728(%rdi), %ymm1
+; AVX512BW-NEXT:    vmovdqa 1664(%rdi), %ymm8
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm13 = ymm8[0],ymm1[0],ymm8[2],ymm1[2]
+; AVX512BW-NEXT:    vmovdqa64 1600(%rdi), %ymm20
+; AVX512BW-NEXT:    vmovdqa64 1536(%rdi), %ymm26
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm15 = ymm26[0],ymm20[0],ymm26[2],ymm20[2]
+; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3]
 ; AVX512BW-NEXT:    vinserti64x4 $0, %ymm13, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11]
 ; AVX512BW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm27, %zmm13
-; AVX512BW-NEXT:    vpermt2q %zmm26, %zmm0, %zmm13
-; AVX512BW-NEXT:    vmovdqa64 %zmm31, %zmm15
-; AVX512BW-NEXT:    vpermt2q %zmm19, %zmm0, %zmm15
+; AVX512BW-NEXT:    vmovdqa64 %zmm28, %zmm13
+; AVX512BW-NEXT:    vpermt2q %zmm23, %zmm0, %zmm13
+; AVX512BW-NEXT:    vmovdqa64 %zmm23, %zmm22
+; AVX512BW-NEXT:    vmovdqa64 %zmm25, %zmm15
+; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm0, %zmm15
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm24
 ; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm15 {%k1}
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],ymm28[1],ymm1[3],ymm28[3]
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm5[1],ymm25[1],ymm5[3],ymm25[3]
-; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm15, %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm1
-; AVX512BW-NEXT:    vpermt2q %zmm30, %zmm0, %zmm1
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm15, %zmm3
-; AVX512BW-NEXT:    vmovdqa64 %zmm22, %zmm13
-; AVX512BW-NEXT:    vpermt2q %zmm22, %zmm0, %zmm3
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm3 {%k1}
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm17[1],ymm23[1],ymm17[3],ymm23[3]
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm10[1],ymm2[1],ymm10[3],ymm2[3]
-; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm3, %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm29, %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm20, %zmm0, %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm7, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm11[1],ymm9[1],ymm11[3],ymm9[3]
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm12[1],ymm16[1],ymm12[3],ymm16[3]
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm5 = ymm6[1],ymm5[1],ymm6[3],ymm5[3]
+; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm5[2,3],ymm3[2,3]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm3, %zmm15, %zmm3
+; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm23, %zmm3
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm6, %zmm0, %zmm3
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm5
+; AVX512BW-NEXT:    vmovdqu64 (%rsp), %zmm16 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm16, %zmm0, %zmm5
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm5 {%k1}
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm7[1],ymm2[1],ymm7[3],ymm2[3]
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm11[1],ymm9[1],ymm11[3],ymm9[3]
+; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm3[2,3],ymm2[2,3]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm5, %zmm2
+; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm12, %zmm2
+; AVX512BW-NEXT:    vpermt2q %zmm21, %zmm0, %zmm2
+; AVX512BW-NEXT:    vmovdqa64 %zmm31, %zmm3
+; AVX512BW-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm17, %zmm0, %zmm3
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm14[1],ymm4[1],ymm14[3],ymm4[3]
 ; AVX512BW-NEXT:    vmovdqa64 1216(%rdi), %zmm9
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm16[1],ymm4[1],ymm16[3],ymm4[3]
-; AVX512BW-NEXT:    vmovdqa64 1152(%rdi), %zmm10
-; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3]
-; AVX512BW-NEXT:    vmovdqa64 1088(%rdi), %zmm11
-; AVX512BW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm2, %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm22, %zmm1
-; AVX512BW-NEXT:    vpermt2q %zmm24, %zmm0, %zmm1
-; AVX512BW-NEXT:    vmovdqu64 (%rsp), %zmm23 # 64-byte Reload
-; AVX512BW-NEXT:    vpermi2q %zmm21, %zmm23, %zmm0
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1}
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm4 = ymm19[1],ymm18[1],ymm19[3],ymm18[3]
+; AVX512BW-NEXT:    vmovdqa64 1152(%rdi), %zmm11
+; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm4[2,3],ymm2[2,3]
+; AVX512BW-NEXT:    vmovdqa64 1088(%rdi), %zmm14
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm29, %zmm2
+; AVX512BW-NEXT:    vpermt2q %zmm27, %zmm0, %zmm2
+; AVX512BW-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermi2q %zmm10, %zmm30, %zmm0
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm0 {%k1}
 ; AVX512BW-NEXT:    vmovdqa64 1024(%rdi), %zmm4
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm12[1],ymm14[1],ymm12[3],ymm14[3]
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm8[1],ymm18[1],ymm8[3],ymm18[3]
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm1[1],ymm8[3],ymm1[3]
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm26[1],ymm20[1],ymm26[3],ymm20[3]
 ; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
 ; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12]
 ; AVX512BW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm27, %zmm1
-; AVX512BW-NEXT:    vpermt2q %zmm26, %zmm0, %zmm1
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm2 = zmm31[0],zmm19[0],zmm31[2],zmm19[2],zmm31[4],zmm19[4],zmm31[6],zmm19[6]
+; AVX512BW-NEXT:    vmovdqa64 %zmm28, %zmm1
+; AVX512BW-NEXT:    vpermt2q %zmm22, %zmm0, %zmm1
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm2 = zmm25[0],zmm24[0],zmm25[2],zmm24[2],zmm25[4],zmm24[4],zmm25[6],zmm24[6]
 ; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
 ; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm3
-; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm12
+; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm8
 ; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm11, %zmm0, %zmm3
+; AVX512BW-NEXT:    vpermt2q %zmm14, %zmm0, %zmm3
+; AVX512BW-NEXT:    vmovdqa64 %zmm14, %zmm15
+; AVX512BW-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [4,12,4,12]
 ; AVX512BW-NEXT:    # ymm1 = mem[0,1,0,1]
-; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm4
-; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm14
-; AVX512BW-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm4
+; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm14
+; AVX512BW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vpermt2q %zmm9, %zmm1, %zmm4
-; AVX512BW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm26
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7]
 ; AVX512BW-NEXT:    vinserti64x4 $0, %ymm3, %zmm2, %zmm2
 ; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm2
-; AVX512BW-NEXT:    vpermt2q %zmm30, %zmm0, %zmm2
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm15[0],zmm13[0],zmm15[2],zmm13[2],zmm15[4],zmm13[4],zmm15[6],zmm13[6]
-; AVX512BW-NEXT:    vmovdqa64 %zmm15, %zmm11
+; AVX512BW-NEXT:    vmovdqa64 %zmm23, %zmm2
+; AVX512BW-NEXT:    vpermt2q %zmm6, %zmm0, %zmm2
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm13[0],zmm16[0],zmm13[2],zmm16[2],zmm13[4],zmm16[4],zmm13[6],zmm16[6]
+; AVX512BW-NEXT:    vmovdqa64 %zmm16, %zmm19
 ; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm16
 ; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 576(%rdi), %zmm4
-; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 512(%rdi), %zmm13
-; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm2
-; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 704(%rdi), %zmm5
+; AVX512BW-NEXT:    vmovdqa64 576(%rdi), %zmm18
+; AVX512BW-NEXT:    vmovdqa64 512(%rdi), %zmm2
+; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm18, %zmm0, %zmm2
+; AVX512BW-NEXT:    vmovdqa64 704(%rdi), %zmm6
 ; AVX512BW-NEXT:    vmovdqa64 640(%rdi), %zmm4
 ; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm5, %zmm1, %zmm4
-; AVX512BW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm6, %zmm1, %zmm4
+; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm11
+; AVX512BW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
 ; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm20, %zmm0, %zmm29
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm6[0],zmm7[0],zmm6[2],zmm7[2],zmm6[4],zmm7[4],zmm6[6],zmm7[6]
-; AVX512BW-NEXT:    vmovdqa64 %zmm29, %zmm3 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 %zmm12, %zmm2
+; AVX512BW-NEXT:    vpermt2q %zmm21, %zmm0, %zmm2
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm31[0],zmm17[0],zmm31[2],zmm17[2],zmm31[4],zmm17[4],zmm31[6],zmm17[6]
+; AVX512BW-NEXT:    vmovdqa64 %zmm31, %zmm6
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
 ; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %zmm4
 ; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm2
 ; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 192(%rdi), %zmm6
-; AVX512BW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 192(%rdi), %zmm5
+; AVX512BW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqa64 128(%rdi), %zmm4
 ; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm6, %zmm1, %zmm4
+; AVX512BW-NEXT:    vpermt2q %zmm5, %zmm1, %zmm4
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
 ; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm22, %zmm2
-; AVX512BW-NEXT:    vpermt2q %zmm24, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 %zmm24, %zmm4
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm23[0],zmm21[0],zmm23[2],zmm21[2],zmm23[4],zmm21[4],zmm23[6],zmm21[6]
-; AVX512BW-NEXT:    vmovdqa64 %zmm21, %zmm6
+; AVX512BW-NEXT:    vmovdqa64 %zmm29, %zmm2
+; AVX512BW-NEXT:    vmovdqa64 %zmm29, %zmm7
+; AVX512BW-NEXT:    vpermt2q %zmm27, %zmm0, %zmm2
+; AVX512BW-NEXT:    vmovdqa64 %zmm27, %zmm5
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm30[0],zmm10[0],zmm30[2],zmm10[2],zmm30[4],zmm10[4],zmm30[6],zmm10[6]
 ; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 1600(%rdi), %zmm7
-; AVX512BW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 1600(%rdi), %zmm9
+; AVX512BW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqa64 1536(%rdi), %zmm2
 ; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermi2q %zmm7, %zmm2, %zmm0
-; AVX512BW-NEXT:    vmovdqa64 1728(%rdi), %zmm7
-; AVX512BW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermi2q %zmm9, %zmm2, %zmm0
+; AVX512BW-NEXT:    vmovdqa64 1728(%rdi), %zmm9
+; AVX512BW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqa64 1664(%rdi), %zmm2
 ; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermi2q %zmm7, %zmm2, %zmm1
+; AVX512BW-NEXT:    vpermi2q %zmm9, %zmm2, %zmm1
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
 ; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm3, %zmm0
 ; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm18 = [0,8,0,8,0,8,0,8]
-; AVX512BW-NEXT:    # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm31, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm19, %zmm18, %zmm0
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm30 = [0,8,0,8,0,8,0,8]
+; AVX512BW-NEXT:    # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vmovdqa64 %zmm25, %zmm0
+; AVX512BW-NEXT:    vpermt2q %zmm24, %zmm30, %zmm0
 ; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm23 = [1,9,1,9,1,9,1,9]
-; AVX512BW-NEXT:    # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm31, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm19, %zmm23, %zmm0
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm31 = [1,9,1,9,1,9,1,9]
+; AVX512BW-NEXT:    # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vmovdqa64 %zmm25, %zmm0
+; AVX512BW-NEXT:    vpermt2q %zmm24, %zmm31, %zmm0
 ; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [6,14,6,14,6,14,6,14]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [6,14,6,14,6,14,6,14]
+; AVX512BW-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vmovdqa64 %zmm25, %zmm3
+; AVX512BW-NEXT:    vpermt2q %zmm24, %zmm2, %zmm3
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm12
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm2 = zmm25[1],zmm24[1],zmm25[3],zmm24[3],zmm25[5],zmm24[5],zmm25[7],zmm24[7]
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm9 = [7,15,7,15,7,15,7,15]
+; AVX512BW-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vpermt2q %zmm24, %zmm9, %zmm25
+; AVX512BW-NEXT:    vmovdqa64 %zmm22, %zmm0
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm28[0],zmm22[0],zmm28[2],zmm22[2],zmm28[4],zmm22[4],zmm28[6],zmm22[6]
+; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm25 {%k1} = zmm28[1],zmm22[1],zmm28[3],zmm22[3],zmm28[5],zmm22[5],zmm28[7],zmm22[7]
+; AVX512BW-NEXT:    vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm28, %zmm3
+; AVX512BW-NEXT:    vmovdqa64 %zmm28, %zmm1
+; AVX512BW-NEXT:    vmovdqa64 %zmm28, %zmm22
+; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm30, %zmm22
+; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm31, %zmm3
+; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [5,13,5,13,5,13,5,13]
 ; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm31, %zmm2
-; AVX512BW-NEXT:    vpermt2q %zmm19, %zmm3, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm1
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm8 = zmm31[1],zmm19[1],zmm31[3],zmm19[3],zmm31[5],zmm19[5],zmm31[7],zmm19[7]
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm10 = [7,15,7,15,7,15,7,15]
-; AVX512BW-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vpermt2q %zmm19, %zmm10, %zmm31
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm1 {%k1} = zmm27[0],zmm26[0],zmm27[2],zmm26[2],zmm27[4],zmm26[4],zmm27[6],zmm26[6]
+; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm3, %zmm1
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm13
+; AVX512BW-NEXT:    vpermt2q %zmm15, %zmm3, %zmm13
+; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm8 = [5,13,5,13]
+; AVX512BW-NEXT:    # ymm8 = mem[0,1,0,1]
+; AVX512BW-NEXT:    vpermt2q %zmm26, %zmm8, %zmm14
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm13, %zmm2, %zmm0
+; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm16, %zmm1
+; AVX512BW-NEXT:    vmovdqa64 %zmm19, %zmm0
+; AVX512BW-NEXT:    vpermt2q %zmm19, %zmm30, %zmm16
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm25
+; AVX512BW-NEXT:    vpermt2q %zmm19, %zmm31, %zmm25
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm15
+; AVX512BW-NEXT:    vmovdqa64 %zmm12, %zmm2
+; AVX512BW-NEXT:    vpermt2q %zmm19, %zmm12, %zmm15
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm19 = zmm1[1],zmm19[1],zmm1[3],zmm19[3],zmm1[5],zmm19[5],zmm1[7],zmm19[7]
+; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm9, %zmm1
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm15 {%k1} = zmm23[0],zmm17[0],zmm23[2],zmm17[2],zmm23[4],zmm17[4],zmm23[6],zmm17[6]
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm23[1],zmm17[1],zmm23[3],zmm17[3],zmm23[5],zmm17[5],zmm23[7],zmm17[7]
 ; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm31 {%k1} = zmm27[1],zmm26[1],zmm27[3],zmm26[3],zmm27[5],zmm26[5],zmm27[7],zmm26[7]
-; AVX512BW-NEXT:    vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm27, %zmm7
-; AVX512BW-NEXT:    vmovdqa64 %zmm27, %zmm1
-; AVX512BW-NEXT:    vpermt2q %zmm26, %zmm18, %zmm27
-; AVX512BW-NEXT:    vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm26, %zmm23, %zmm7
-; AVX512BW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [5,13,5,13,5,13,5,13]
-; AVX512BW-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vpermt2q %zmm26, %zmm2, %zmm1
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm8 {%k1}
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm19, %zmm2, %zmm12
-; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm7 = [5,13,5,13]
-; AVX512BW-NEXT:    # ymm7 = mem[0,1,0,1]
-; AVX512BW-NEXT:    vmovdqa64 %zmm14, %zmm15
-; AVX512BW-NEXT:    vpermt2q %zmm9, %zmm7, %zmm15
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm15[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm8, %zmm0
+; AVX512BW-NEXT:    vmovdqa64 %zmm23, %zmm0
+; AVX512BW-NEXT:    vmovdqa64 %zmm23, %zmm27
+; AVX512BW-NEXT:    vmovdqa64 %zmm23, %zmm28
+; AVX512BW-NEXT:    vpermt2q %zmm17, %zmm30, %zmm28
+; AVX512BW-NEXT:    vpermt2q %zmm17, %zmm31, %zmm0
 ; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm27
-; AVX512BW-NEXT:    vpermt2q %zmm16, %zmm18, %zmm27
-; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm22
-; AVX512BW-NEXT:    vpermt2q %zmm16, %zmm23, %zmm22
-; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm21
-; AVX512BW-NEXT:    vpermt2q %zmm16, %zmm3, %zmm21
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm14 = zmm11[1],zmm16[1],zmm11[3],zmm16[3],zmm11[5],zmm16[5],zmm11[7],zmm16[7]
-; AVX512BW-NEXT:    vpermt2q %zmm16, %zmm10, %zmm11
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm21 {%k1} = zmm16[0],zmm30[0],zmm16[2],zmm30[2],zmm16[4],zmm30[4],zmm16[6],zmm30[6]
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm11 {%k1} = zmm16[1],zmm30[1],zmm16[3],zmm30[3],zmm16[5],zmm30[5],zmm16[7],zmm30[7]
-; AVX512BW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm16, %zmm29
-; AVX512BW-NEXT:    vmovdqa64 %zmm16, %zmm11
-; AVX512BW-NEXT:    vpermt2q %zmm30, %zmm18, %zmm16
-; AVX512BW-NEXT:    vpermt2q %zmm30, %zmm23, %zmm29
-; AVX512BW-NEXT:    vpermt2q %zmm30, %zmm2, %zmm11
-; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm8
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm2, %zmm8
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm5, %zmm7, %zmm15
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm15[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm31
+; AVX512BW-NEXT:    vpermt2q %zmm17, %zmm3, %zmm27
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm13
+; AVX512BW-NEXT:    vpermt2q %zmm18, %zmm3, %zmm13
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm11, %zmm8, %zmm14
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm14[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm12
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm18, %zmm31
-; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm20
-; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm23, %zmm20
-; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm26
-; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm3, %zmm26
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm9 = zmm5[1],zmm1[1],zmm5[3],zmm1[3],zmm5[5],zmm1[5],zmm5[7],zmm1[7]
-; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm10, %zmm5
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm26 {%k1} = zmm12[0],zmm28[0],zmm12[2],zmm28[2],zmm12[4],zmm28[4],zmm12[6],zmm28[6]
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm12[1],zmm28[1],zmm12[3],zmm28[3],zmm12[5],zmm28[5],zmm12[7],zmm28[7]
-; AVX512BW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm12, %zmm24
-; AVX512BW-NEXT:    vmovdqa64 %zmm12, %zmm5
-; AVX512BW-NEXT:    vpermt2q %zmm28, %zmm18, %zmm12
-; AVX512BW-NEXT:    vpermt2q %zmm28, %zmm23, %zmm24
-; AVX512BW-NEXT:    vpermt2q %zmm28, %zmm2, %zmm5
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm30, %zmm25
-; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm18, %zmm25
-; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm1
-; AVX512BW-NEXT:    vmovdqu64 (%rsp), %zmm6 # 64-byte Reload
-; AVX512BW-NEXT:    vpermi2q %zmm1, %zmm6, %zmm18
-; AVX512BW-NEXT:    vmovdqa64 %zmm30, %zmm28
-; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm23, %zmm28
-; AVX512BW-NEXT:    vpermi2q %zmm1, %zmm6, %zmm23
-; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm17
-; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm3, %zmm17
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm8 = zmm6[1],zmm1[1],zmm6[3],zmm1[3],zmm6[5],zmm1[5],zmm6[7],zmm1[7]
-; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm10, %zmm6
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm17 {%k1} = zmm30[0],zmm4[0],zmm30[2],zmm4[2],zmm30[4],zmm4[4],zmm30[6],zmm4[6]
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm6 {%k1} = zmm30[1],zmm4[1],zmm30[3],zmm4[3],zmm30[5],zmm4[5],zmm30[7],zmm4[7]
-; AVX512BW-NEXT:    vmovdqu64 %zmm6, (%rsp) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm2, %zmm30
-; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm6
-; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm15
+; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm30, %zmm12
+; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm23
+; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm31, %zmm23
+; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm29
+; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm2, %zmm29
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm14 = zmm6[1],zmm1[1],zmm6[3],zmm1[3],zmm6[5],zmm1[5],zmm6[7],zmm1[7]
+; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm9, %zmm6
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm21, %zmm0
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm29 {%k1} = zmm20[0],zmm21[0],zmm20[2],zmm21[2],zmm20[4],zmm21[4],zmm20[6],zmm21[6]
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm6 {%k1} = zmm20[1],zmm21[1],zmm20[3],zmm21[3],zmm20[5],zmm21[5],zmm20[7],zmm21[7]
+; AVX512BW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm20, %zmm21
+; AVX512BW-NEXT:    vmovdqa64 %zmm20, %zmm6
+; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm30, %zmm20
+; AVX512BW-NEXT:    vmovdqa64 %zmm31, %zmm10
+; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm31, %zmm21
 ; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm3, %zmm6
-; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm10, %zmm13
-; AVX512BW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm3
+; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm24
+; AVX512BW-NEXT:    vpermt2q %zmm5, %zmm30, %zmm24
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm2, %zmm3
+; AVX512BW-NEXT:    vpermi2q %zmm1, %zmm0, %zmm30
+; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm31
+; AVX512BW-NEXT:    vpermt2q %zmm5, %zmm10, %zmm31
+; AVX512BW-NEXT:    vpermi2q %zmm1, %zmm0, %zmm10
+; AVX512BW-NEXT:    vmovdqu64 %zmm10, (%rsp) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm17
+; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm2, %zmm17
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm10 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
+; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm9, %zmm0
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm17 {%k1} = zmm7[0],zmm5[0],zmm7[2],zmm5[2],zmm7[4],zmm5[4],zmm7[6],zmm5[6]
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm7[1],zmm5[1],zmm7[3],zmm5[3],zmm7[5],zmm5[5],zmm7[7],zmm5[7]
+; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm5, %zmm3, %zmm7
+; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm13
+; AVX512BW-NEXT:    vpermt2q %zmm18, %zmm2, %zmm13
+; AVX512BW-NEXT:    vpermt2q %zmm18, %zmm9, %zmm4
+; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
 ; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm1
-; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm15, %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm10, %zmm4
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm3, %zmm1
+; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm5
+; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm2, %zmm5
+; AVX512BW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm9, %zmm4
 ; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm4
-; AVX512BW-NEXT:    vpermt2q %zmm19, %zmm15, %zmm4
-; AVX512BW-NEXT:    vpermi2q %zmm1, %zmm0, %zmm15
-; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm10, %zmm0
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm19, %zmm10, %zmm13
-; AVX512BW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm14 {%k1}
-; AVX512BW-NEXT:    vinserti64x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm0 # 32-byte Folded Reload
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512BW-NEXT:    vpermi2q %zmm18, %zmm0, %zmm3
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm11
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm2, %zmm11
+; AVX512BW-NEXT:    vpermi2q %zmm18, %zmm0, %zmm2
+; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm18, %zmm9, %zmm0
 ; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm9 {%k1}
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm5
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm13, %zmm7, %zmm5
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm5[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm9, %zmm0
+; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm9, %zmm5
+; AVX512BW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm27, %zmm19 {%k1}
+; AVX512BW-NEXT:    vinserti64x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm19, %zmm0 # 32-byte Folded Reload
 ; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm30, %zmm8 {%k1}
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm14 {%k1}
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512BW-NEXT:    vpermi2q %zmm9, %zmm5, %zmm7
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm7[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm8, %zmm0
+; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm6
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm19, %zmm8, %zmm6
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm14, %zmm0
 ; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [6,14,6,14]
-; AVX512BW-NEXT:    # ymm0 = mem[0,1,0,1]
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm3
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm0, %zmm3
-; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm7 = [7,15,7,15]
-; AVX512BW-NEXT:    # ymm7 = mem[0,1,0,1]
-; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm7, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm1
-; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm11
-; AVX512BW-NEXT:    vpermt2q %zmm13, %zmm0, %zmm11
-; AVX512BW-NEXT:    vpermt2q %zmm13, %zmm7, %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm10
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm0, %zmm10
-; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm2
-; AVX512BW-NEXT:    vpermi2q %zmm9, %zmm5, %zmm0
-; AVX512BW-NEXT:    vpermt2q %zmm9, %zmm7, %zmm2
-; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm7, %zmm8
-; AVX512BW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7]
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm4, %zmm1, %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm3, %zmm21, %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm12, %zmm31 {%k1}
-; AVX512BW-NEXT:    vmovdqa 192(%rdi), %xmm14
-; AVX512BW-NEXT:    vmovdqa 128(%rdi), %xmm13
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm8 = xmm13[0],xmm14[0]
-; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm10
-; AVX512BW-NEXT:    vmovdqa 64(%rdi), %xmm7
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm21 = xmm10[0],xmm7[0]
-; AVX512BW-NEXT:    vinserti32x4 $1, %xmm8, %ymm21, %ymm8
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm8, %zmm31, %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqa64 %zmm16, %zmm27 {%k1}
-; AVX512BW-NEXT:    vmovdqa 704(%rdi), %xmm12
-; AVX512BW-NEXT:    vmovdqa64 640(%rdi), %xmm21
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm30 = xmm21[0],xmm12[0]
-; AVX512BW-NEXT:    vmovdqa64 576(%rdi), %xmm31
-; AVX512BW-NEXT:    vmovdqa 512(%rdi), %xmm5
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm9 = xmm5[0],xmm31[0]
-; AVX512BW-NEXT:    vinserti32x4 $1, %xmm30, %ymm9, %ymm9
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm9, %zmm27, %zmm1
-; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm10 {%k1}
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512BW-NEXT:    vpermi2q %zmm18, %zmm14, %zmm8
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm8[4,5,6,7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm10, %zmm27
+; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [6,14,6,14]
+; AVX512BW-NEXT:    # ymm2 = mem[0,1,0,1]
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm3 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 1216(%rdi), %xmm19
-; AVX512BW-NEXT:    vmovdqa64 1152(%rdi), %xmm27
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm30 = xmm27[0],xmm19[0]
-; AVX512BW-NEXT:    vmovdqa 1088(%rdi), %xmm2
-; AVX512BW-NEXT:    vmovdqa 1024(%rdi), %xmm1
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm1[0],xmm2[0]
-; AVX512BW-NEXT:    vinserti32x4 $1, %xmm30, %ymm4, %ymm4
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm4, %zmm3, %zmm8
-; AVX512BW-NEXT:    vmovdqa64 %zmm25, %zmm18 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 1728(%rdi), %xmm25
-; AVX512BW-NEXT:    vmovdqa64 1664(%rdi), %xmm30
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm6 = xmm30[0],xmm25[0]
-; AVX512BW-NEXT:    vmovdqa 1600(%rdi), %xmm4
-; AVX512BW-NEXT:    vmovdqa 1536(%rdi), %xmm3
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm9 = xmm3[0],xmm4[0]
-; AVX512BW-NEXT:    vinserti128 $1, %xmm6, %ymm9, %ymm6
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm6, %zmm18, %zmm6
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm3
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm2, %zmm3
+; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [7,15,7,15]
+; AVX512BW-NEXT:    # ymm4 = mem[0,1,0,1]
+; AVX512BW-NEXT:    vpermt2q %zmm0, %zmm4, %zmm1
+; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm6
+; AVX512BW-NEXT:    vpermt2q %zmm19, %zmm2, %zmm6
+; AVX512BW-NEXT:    vpermt2q %zmm19, %zmm4, %zmm9
+; AVX512BW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm16 {%k1}
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} xmm9 = xmm27[1],xmm19[1]
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm2[1]
-; AVX512BW-NEXT:    vinserti128 $1, %xmm9, %ymm1, %ymm1
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm16, %zmm1
-; AVX512BW-NEXT:    vmovdqa64 %zmm29, %zmm22 {%k1}
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm21[1],xmm12[1]
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm31[1]
-; AVX512BW-NEXT:    vinserti128 $1, %xmm2, %ymm5, %ymm2
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm22, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 %zmm24, %zmm20 {%k1}
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} xmm5 = xmm13[1],xmm14[1]
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} xmm7 = xmm10[1],xmm7[1]
-; AVX512BW-NEXT:    vinserti128 $1, %xmm5, %ymm7, %ymm5
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm5, %zmm20, %zmm5
-; AVX512BW-NEXT:    vmovdqa64 %zmm28, %zmm23 {%k1}
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} xmm7 = xmm30[1],xmm25[1]
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm4[1]
-; AVX512BW-NEXT:    vinserti128 $1, %xmm7, %ymm3, %ymm3
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm3, %zmm23, %zmm3
-; AVX512BW-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm4 # 32-byte Folded Reload
-; AVX512BW-NEXT:    # ymm4 = mem[0,1,2,3],ymm11[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm4, %zmm26, %zmm4
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm17, %zmm0
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512BW-NEXT:    vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
-; AVX512BW-NEXT:    # ymm7 = mem[0,1,2,3],ymm7[4,5,6,7]
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512BW-NEXT:    vinsertf64x4 $0, %ymm7, %zmm9, %zmm7
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512BW-NEXT:    vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload
-; AVX512BW-NEXT:    # ymm9 = mem[0,1,2,3],ymm9[4,5,6,7]
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512BW-NEXT:    vinsertf64x4 $0, %ymm9, %zmm10, %zmm9
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512BW-NEXT:    vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload
-; AVX512BW-NEXT:    # ymm10 = mem[0,1,2,3],ymm10[4,5,6,7]
-; AVX512BW-NEXT:    vmovups (%rsp), %zmm11 # 64-byte Reload
-; AVX512BW-NEXT:    vinsertf64x4 $0, %ymm10, %zmm11, %zmm10
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512BW-NEXT:    vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
-; AVX512BW-NEXT:    # ymm11 = mem[0,1,2,3],ymm11[4,5,6,7]
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512BW-NEXT:    vinsertf64x4 $0, %ymm11, %zmm12, %zmm11
-; AVX512BW-NEXT:    vmovdqa64 %zmm6, 192(%rsi)
-; AVX512BW-NEXT:    vmovdqa64 %zmm8, 128(%rsi)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm6, 64(%rsi)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm6, (%rsi)
-; AVX512BW-NEXT:    vmovdqa64 %zmm3, 192(%rdx)
-; AVX512BW-NEXT:    vmovdqa64 %zmm5, (%rdx)
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, 64(%rdx)
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, 128(%rdx)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm1, 192(%rcx)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm1, (%rcx)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm1, 64(%rcx)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm1, 128(%rcx)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm1, 192(%r8)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm1, (%r8)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm1, 64(%r8)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm1, 128(%r8)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm1, 192(%r9)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm1, (%r9)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm1, 64(%r9)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm1, 128(%r9)
-; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm1, 192(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm1, (%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm8
+; AVX512BW-NEXT:    vpermt2q %zmm26, %zmm2, %zmm8
+; AVX512BW-NEXT:    vpermi2q %zmm18, %zmm14, %zmm2
+; AVX512BW-NEXT:    vpermt2q %zmm18, %zmm4, %zmm14
+; AVX512BW-NEXT:    vmovdqa64 %zmm14, %zmm0
+; AVX512BW-NEXT:    vpermt2q %zmm26, %zmm4, %zmm9
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm11[0,1,2,3],ymm8[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm4, %zmm5, %zmm4
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3],ymm3[4,5,6,7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm3, %zmm15, %zmm3
+; AVX512BW-NEXT:    vmovdqa64 %zmm20, %zmm12 {%k1}
+; AVX512BW-NEXT:    vmovdqa 64(%rdi), %xmm5
+; AVX512BW-NEXT:    vinserti128 $1, 192(%rdi), %ymm5, %ymm5
+; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm7
+; AVX512BW-NEXT:    vinserti128 $1, 128(%rdi), %ymm7, %ymm7
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm5[0],ymm7[2],ymm5[2]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm8, %zmm12, %zmm8
+; AVX512BW-NEXT:    vmovdqa64 %zmm28, %zmm16 {%k1}
+; AVX512BW-NEXT:    vmovdqa 576(%rdi), %xmm10
+; AVX512BW-NEXT:    vinserti128 $1, 704(%rdi), %ymm10, %ymm10
+; AVX512BW-NEXT:    vmovdqa 512(%rdi), %xmm11
+; AVX512BW-NEXT:    vinserti128 $1, 640(%rdi), %ymm11, %ymm11
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm12 = ymm11[0],ymm10[0],ymm11[2],ymm10[2]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm12, %zmm16, %zmm12
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm22, %zmm14 {%k1}
+; AVX512BW-NEXT:    vmovdqa 1088(%rdi), %xmm13
+; AVX512BW-NEXT:    vinserti128 $1, 1216(%rdi), %ymm13, %ymm13
+; AVX512BW-NEXT:    vmovdqa 1024(%rdi), %xmm15
+; AVX512BW-NEXT:    vinserti128 $1, 1152(%rdi), %ymm15, %ymm15
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm20 = ymm15[0],ymm13[0],ymm15[2],ymm13[2]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm20, %zmm14, %zmm20
+; AVX512BW-NEXT:    vmovdqa64 %zmm24, %zmm30 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 1600(%rdi), %xmm22
+; AVX512BW-NEXT:    vinserti32x4 $1, 1728(%rdi), %ymm22, %ymm22
+; AVX512BW-NEXT:    vmovdqa64 1536(%rdi), %xmm24
+; AVX512BW-NEXT:    vinserti32x4 $1, 1664(%rdi), %ymm24, %ymm24
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm26 = ymm24[0],ymm22[0],ymm24[2],ymm22[2]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm26, %zmm30, %zmm16
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm18, %zmm14 {%k1}
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm13 = ymm15[1],ymm13[1],ymm15[3],ymm13[3]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm13, %zmm14, %zmm13
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm14, %zmm25 {%k1}
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm10 = ymm11[1],ymm10[1],ymm11[3],ymm10[3]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm10, %zmm25, %zmm10
+; AVX512BW-NEXT:    vmovdqa64 %zmm21, %zmm23 {%k1}
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm5 = ymm7[1],ymm5[1],ymm7[3],ymm5[3]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm5, %zmm23, %zmm5
+; AVX512BW-NEXT:    vmovdqu64 (%rsp), %zmm11 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm31, %zmm11 {%k1}
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm7 = ymm24[1],ymm22[1],ymm24[3],ymm22[3]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm7, %zmm11, %zmm7
+; AVX512BW-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
+; AVX512BW-NEXT:    # ymm6 = mem[0,1,2,3],ymm6[4,5,6,7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm6, %zmm29, %zmm6
+; AVX512BW-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
+; AVX512BW-NEXT:    # ymm2 = mem[0,1,2,3],ymm2[4,5,6,7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm17, %zmm2
+; AVX512BW-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm11 # 32-byte Folded Reload
+; AVX512BW-NEXT:    # ymm11 = mem[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm11, %zmm1, %zmm11
 ; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm1, 64(%rax)
+; AVX512BW-NEXT:    vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload
+; AVX512BW-NEXT:    # ymm14 = mem[0,1,2,3],ymm1[4,5,6,7]
 ; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm1, 128(%rax)
+; AVX512BW-NEXT:    vinsertf64x4 $0, %ymm14, %zmm1, %zmm14
+; AVX512BW-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload
+; AVX512BW-NEXT:    # ymm15 = mem[0,1,2,3],ymm0[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm15, %zmm0, %zmm15
+; AVX512BW-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload
+; AVX512BW-NEXT:    # ymm9 = mem[0,1,2,3],ymm9[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm9, %zmm0, %zmm9
+; AVX512BW-NEXT:    vmovdqa64 %zmm16, 192(%rsi)
+; AVX512BW-NEXT:    vmovdqa64 %zmm20, 128(%rsi)
+; AVX512BW-NEXT:    vmovdqa64 %zmm12, 64(%rsi)
+; AVX512BW-NEXT:    vmovdqa64 %zmm8, (%rsi)
+; AVX512BW-NEXT:    vmovdqa64 %zmm7, 192(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm5, (%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm10, 64(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm13, 128(%rdx)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm5, 192(%rcx)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm5, (%rcx)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm5, 64(%rcx)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm5, 128(%rcx)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm5, 192(%r8)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm5, (%r8)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm5, 64(%r8)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm5, 128(%r8)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm0, 192(%r9)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm0, (%r9)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm5, 64(%r9)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm5, 128(%r9)
 ; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, 192(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm4, (%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm27, 192(%rax)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm0, (%rax)
 ; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-NEXT:    vmovaps %zmm0, 64(%rax)
 ; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-NEXT:    vmovaps %zmm0, 128(%rax)
 ; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-NEXT:    vmovaps %zmm11, 128(%rax)
-; AVX512BW-NEXT:    vmovaps %zmm10, 192(%rax)
-; AVX512BW-NEXT:    vmovaps %zmm9, (%rax)
-; AVX512BW-NEXT:    vmovaps %zmm7, 64(%rax)
-; AVX512BW-NEXT:    addq $2568, %rsp # imm = 0xA08
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, 192(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm6, (%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, 64(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm4, 128(%rax)
+; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512BW-NEXT:    vmovdqa64 %zmm9, 128(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm15, 192(%rax)
+; AVX512BW-NEXT:    vmovaps %zmm14, (%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm11, 64(%rax)
+; AVX512BW-NEXT:    addq $2440, %rsp # imm = 0x988
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
   %wide.vec = load <256 x i64>, ptr %in.vec, align 64
@@ -8322,199 +8017,135 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ;
 ; AVX2-ONLY-LABEL: load_i64_stride8_vf64:
 ; AVX2-ONLY:       # %bb.0:
-; AVX2-ONLY-NEXT:    subq $5560, %rsp # imm = 0x15B8
-; AVX2-ONLY-NEXT:    vmovaps 2496(%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps 2432(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 3008(%rdi), %xmm2
-; AVX2-ONLY-NEXT:    vmovaps 2944(%rdi), %xmm3
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm4 = xmm3[0],xmm2[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 3520(%rdi), %xmm4
-; AVX2-ONLY-NEXT:    vmovaps 3456(%rdi), %xmm5
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm6 = xmm5[0],xmm4[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 2880(%rdi), %xmm6
-; AVX2-ONLY-NEXT:    vmovaps 3392(%rdi), %xmm7
-; AVX2-ONLY-NEXT:    vmovaps 3904(%rdi), %xmm8
-; AVX2-ONLY-NEXT:    vmovaps 3840(%rdi), %xmm9
-; AVX2-ONLY-NEXT:    vmovaps 4032(%rdi), %xmm10
-; AVX2-ONLY-NEXT:    vmovaps 3968(%rdi), %xmm11
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm12 = xmm11[0],xmm10[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm10 = xmm11[1],xmm10[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm10 = xmm9[0],xmm8[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm8 = xmm9[1],xmm8[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 3328(%rdi), %xmm8
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm4 = xmm5[1],xmm4[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm4 = xmm8[0],xmm7[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm4 = xmm8[1],xmm7[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 2816(%rdi), %xmm4
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm2 = xmm3[1],xmm2[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm4[0],xmm6[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm2 = xmm4[1],xmm6[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 2368(%rdi), %xmm2
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 2304(%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm0[0],xmm2[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 1984(%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps 1920(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 1856(%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps 1792(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 1472(%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps 1408(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 1344(%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps 1280(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 448(%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps 384(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-ONLY-NEXT:    subq $5064, %rsp # imm = 0x13C8
 ; AVX2-ONLY-NEXT:    vmovaps 320(%rdi), %xmm0
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 448(%rdi), %ymm0, %ymm0
 ; AVX2-ONLY-NEXT:    vmovaps 256(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 960(%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps 896(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 832(%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps 768(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 3776(%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps 3712(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 3648(%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps 3584(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 3264(%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps 3200(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 3136(%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps 3072(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 2752(%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps 2688(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 2624(%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps 2560(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 1728(%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps 1664(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 1600(%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps 1536(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 192(%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps 128(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 704(%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps 640(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 384(%rdi), %ymm1, %ymm1
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 832(%rdi), %xmm2
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 960(%rdi), %ymm2, %ymm2
+; AVX2-ONLY-NEXT:    vmovaps 768(%rdi), %xmm3
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 896(%rdi), %ymm3, %ymm3
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm4 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 1344(%rdi), %xmm4
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 1472(%rdi), %ymm4, %ymm4
+; AVX2-ONLY-NEXT:    vmovaps 1280(%rdi), %xmm5
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 1408(%rdi), %ymm5, %ymm5
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm6 = ymm5[0],ymm4[0],ymm5[2],ymm4[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 1856(%rdi), %xmm6
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 1984(%rdi), %ymm6, %ymm6
+; AVX2-ONLY-NEXT:    vmovaps 1792(%rdi), %xmm7
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 1920(%rdi), %ymm7, %ymm7
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm8 = ymm7[0],ymm6[0],ymm7[2],ymm6[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 2368(%rdi), %xmm8
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 2496(%rdi), %ymm8, %ymm8
+; AVX2-ONLY-NEXT:    vmovaps 2304(%rdi), %xmm9
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 2432(%rdi), %ymm9, %ymm9
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm10 = ymm9[0],ymm8[0],ymm9[2],ymm8[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 2880(%rdi), %xmm10
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 3008(%rdi), %ymm10, %ymm10
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 2816(%rdi), %xmm0
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 2944(%rdi), %ymm0, %ymm0
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm1 = ymm3[1],ymm2[1],ymm3[3],ymm2[3]
+; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm0[0],ymm10[0],ymm0[2],ymm10[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm1 = ymm5[1],ymm4[1],ymm5[3],ymm4[3]
+; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 3392(%rdi), %xmm1
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 3520(%rdi), %ymm1, %ymm1
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm2 = ymm7[1],ymm6[1],ymm7[3],ymm6[3]
+; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 3328(%rdi), %xmm2
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 3456(%rdi), %ymm2, %ymm2
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm3 = ymm9[1],ymm8[1],ymm9[3],ymm8[3]
+; AVX2-ONLY-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[2],ymm1[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm10[1],ymm0[3],ymm10[3]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 3904(%rdi), %xmm0
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 4032(%rdi), %ymm0, %ymm0
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3]
+; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 3840(%rdi), %xmm1
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 3968(%rdi), %ymm1, %ymm1
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 64(%rdi), %xmm0
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 192(%rdi), %ymm0, %ymm0
+; AVX2-ONLY-NEXT:    vmovaps (%rdi), %xmm1
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 128(%rdi), %ymm1, %ymm1
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovaps 576(%rdi), %xmm0
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 704(%rdi), %ymm0, %ymm0
 ; AVX2-ONLY-NEXT:    vmovaps 512(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 2240(%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps 2176(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 2112(%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps 2048(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 1216(%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps 1152(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 640(%rdi), %ymm1, %ymm1
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovaps 1088(%rdi), %xmm0
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 1216(%rdi), %ymm0, %ymm0
 ; AVX2-ONLY-NEXT:    vmovaps 1024(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps (%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps 64(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm0[0],xmm1[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 1152(%rdi), %ymm1, %ymm1
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 1600(%rdi), %xmm0
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 1728(%rdi), %ymm0, %ymm0
+; AVX2-ONLY-NEXT:    vmovaps 1536(%rdi), %xmm1
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 1664(%rdi), %ymm1, %ymm1
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 2112(%rdi), %xmm0
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 2240(%rdi), %ymm0, %ymm0
+; AVX2-ONLY-NEXT:    vmovaps 2048(%rdi), %xmm1
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 2176(%rdi), %ymm1, %ymm1
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 2624(%rdi), %xmm0
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 2752(%rdi), %ymm0, %ymm0
+; AVX2-ONLY-NEXT:    vmovaps 2560(%rdi), %xmm1
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 2688(%rdi), %ymm1, %ymm1
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 3136(%rdi), %xmm0
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 3264(%rdi), %ymm0, %ymm0
+; AVX2-ONLY-NEXT:    vmovaps 3072(%rdi), %xmm1
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 3200(%rdi), %ymm1, %ymm1
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 3648(%rdi), %xmm0
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 3776(%rdi), %ymm0, %ymm0
+; AVX2-ONLY-NEXT:    vmovaps 3584(%rdi), %xmm1
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 3712(%rdi), %ymm1, %ymm1
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovaps 320(%rdi), %ymm2
 ; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovaps 256(%rdi), %ymm1
@@ -8807,198 +8438,134 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-ONLY-NEXT:    # ymm1 = ymm1[1],mem[1],ymm1[3],mem[3]
 ; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 32(%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps 96(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm0[0],xmm1[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 224(%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps 160(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 96(%rdi), %xmm0
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 224(%rdi), %ymm0, %ymm0
+; AVX2-ONLY-NEXT:    vmovaps 32(%rdi), %xmm1
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 160(%rdi), %ymm1, %ymm1
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovaps 352(%rdi), %xmm0
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 480(%rdi), %ymm0, %ymm0
 ; AVX2-ONLY-NEXT:    vmovaps 288(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 480(%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps 416(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 416(%rdi), %ymm1, %ymm1
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovaps 608(%rdi), %xmm0
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 736(%rdi), %ymm0, %ymm0
 ; AVX2-ONLY-NEXT:    vmovaps 544(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 736(%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps 672(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 672(%rdi), %ymm1, %ymm1
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovaps 864(%rdi), %xmm0
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 992(%rdi), %ymm0, %ymm0
 ; AVX2-ONLY-NEXT:    vmovaps 800(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 992(%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps 928(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 928(%rdi), %ymm1, %ymm1
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovaps 1120(%rdi), %xmm0
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 1248(%rdi), %ymm0, %ymm0
 ; AVX2-ONLY-NEXT:    vmovaps 1056(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 1248(%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps 1184(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 1184(%rdi), %ymm1, %ymm1
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovaps 1376(%rdi), %xmm0
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 1504(%rdi), %ymm0, %ymm0
 ; AVX2-ONLY-NEXT:    vmovaps 1312(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 1504(%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps 1440(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 1440(%rdi), %ymm1, %ymm1
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovaps 1632(%rdi), %xmm0
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 1760(%rdi), %ymm0, %ymm0
 ; AVX2-ONLY-NEXT:    vmovaps 1568(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 1760(%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps 1696(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 1696(%rdi), %ymm1, %ymm1
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovaps 1888(%rdi), %xmm0
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 2016(%rdi), %ymm0, %ymm0
 ; AVX2-ONLY-NEXT:    vmovaps 1824(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 2016(%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps 1952(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 1952(%rdi), %ymm1, %ymm1
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovaps 2144(%rdi), %xmm0
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 2272(%rdi), %ymm0, %ymm0
 ; AVX2-ONLY-NEXT:    vmovaps 2080(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 2272(%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps 2208(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 2208(%rdi), %ymm1, %ymm1
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovaps 2400(%rdi), %xmm0
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 2528(%rdi), %ymm0, %ymm0
 ; AVX2-ONLY-NEXT:    vmovaps 2336(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 2528(%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps 2464(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 2464(%rdi), %ymm1, %ymm1
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovaps 2656(%rdi), %xmm0
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 2784(%rdi), %ymm0, %ymm0
 ; AVX2-ONLY-NEXT:    vmovaps 2592(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 2784(%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps 2720(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 2720(%rdi), %ymm1, %ymm1
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovaps 2912(%rdi), %xmm0
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 3040(%rdi), %ymm0, %ymm0
 ; AVX2-ONLY-NEXT:    vmovaps 2848(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 3040(%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps 2976(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 2976(%rdi), %ymm1, %ymm1
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovaps 3168(%rdi), %xmm0
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 3296(%rdi), %ymm0, %ymm0
 ; AVX2-ONLY-NEXT:    vmovaps 3104(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 3296(%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps 3232(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 3232(%rdi), %ymm1, %ymm1
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovaps 3424(%rdi), %xmm0
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 3552(%rdi), %ymm0, %ymm0
 ; AVX2-ONLY-NEXT:    vmovaps 3360(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 3552(%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps 3488(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 3488(%rdi), %ymm1, %ymm1
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovaps 3680(%rdi), %xmm0
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 3808(%rdi), %ymm0, %ymm0
 ; AVX2-ONLY-NEXT:    vmovaps 3616(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 3808(%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps 3744(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 3744(%rdi), %ymm1, %ymm1
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovaps 3936(%rdi), %xmm0
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 4064(%rdi), %ymm0, %ymm0
 ; AVX2-ONLY-NEXT:    vmovaps 3872(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 4064(%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps 4000(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 4000(%rdi), %ymm1, %ymm1
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovaps 96(%rdi), %ymm2
 ; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovaps 32(%rdi), %ymm1
@@ -9016,7 +8583,7 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-ONLY-NEXT:    vmovaps 288(%rdi), %ymm2
 ; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovaps 480(%rdi), %ymm0
-; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovups %ymm0, (%rsp) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovaps 416(%rdi), %ymm3
 ; AVX2-ONLY-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2]
@@ -9186,7 +8753,7 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    vunpckhpd (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload
 ; AVX2-ONLY-NEXT:    # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload
@@ -9232,7 +8799,7 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-ONLY-NEXT:    vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload
 ; AVX2-ONLY-NEXT:    # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3]
 ; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3]
-; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovups %ymm0, (%rsp) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
 ; AVX2-ONLY-NEXT:    # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
@@ -9286,134 +8853,70 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm2[1],ymm1[3],ymm2[3]
 ; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm1 = ymm12[1],ymm4[1],ymm12[3],ymm4[3]
 ; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 464(%rsi)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 448(%rsi)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 256(%rsi)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 384(%rsi)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 320(%rsi)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 192(%rsi)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 128(%rsi)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 64(%rsi)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, (%rsi)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 272(%rsi)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 400(%rsi)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 336(%rsi)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 208(%rsi)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 144(%rsi)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 80(%rsi)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 16(%rsi)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 496(%rsi)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 480(%rsi)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 32(%rsi)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 416(%rsi)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 352(%rsi)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 288(%rsi)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 224(%rsi)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 160(%rsi)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 96(%rsi)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 48(%rsi)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 432(%rsi)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 368(%rsi)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 304(%rsi)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 240(%rsi)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 176(%rsi)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 112(%rsi)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 128(%rdx)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 144(%rdx)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 256(%rdx)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 272(%rdx)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 64(%rdx)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 80(%rdx)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, (%rdx)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 16(%rdx)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 192(%rdx)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 208(%rdx)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 320(%rdx)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 336(%rdx)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 384(%rdx)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 400(%rdx)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 448(%rdx)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 464(%rdx)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 96(%rdx)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 112(%rdx)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 32(%rdx)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 48(%rdx)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 160(%rdx)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 176(%rdx)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 224(%rdx)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 240(%rdx)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 288(%rdx)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 304(%rdx)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 352(%rdx)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 368(%rdx)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 416(%rdx)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 432(%rdx)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 480(%rdx)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 496(%rdx)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, 448(%rsi)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, 384(%rsi)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, 320(%rsi)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, 256(%rsi)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, 192(%rsi)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, 128(%rsi)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, 64(%rsi)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, (%rsi)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, 480(%rsi)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, 416(%rsi)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, 352(%rsi)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, 288(%rsi)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, 224(%rsi)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, 160(%rsi)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, 96(%rsi)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, 32(%rsi)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, 448(%rdx)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, 384(%rdx)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, 320(%rdx)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, 256(%rdx)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, 192(%rdx)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, 128(%rdx)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, 64(%rdx)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, (%rdx)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, 480(%rdx)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, 416(%rdx)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, 352(%rdx)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, 288(%rdx)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, 224(%rdx)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, 160(%rdx)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, 96(%rdx)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, 32(%rdx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm1, 448(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
@@ -9478,144 +8981,80 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-ONLY-NEXT:    vmovaps %ymm1, 32(%r8)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm1, (%r8)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 496(%r9)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 480(%r9)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 464(%r9)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 448(%r9)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 432(%r9)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 416(%r9)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 400(%r9)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 384(%r9)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 368(%r9)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 352(%r9)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 336(%r9)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 320(%r9)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 304(%r9)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 288(%r9)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 272(%r9)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 256(%r9)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 240(%r9)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 224(%r9)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 208(%r9)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 192(%r9)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 176(%r9)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 160(%r9)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 144(%r9)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 128(%r9)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 112(%r9)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 96(%r9)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 80(%r9)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 64(%r9)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 48(%r9)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 32(%r9)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 16(%r9)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, (%r9)
-; AVX2-ONLY-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 496(%rax)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 480(%rax)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 464(%rax)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 448(%rax)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 432(%rax)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 416(%rax)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 400(%rax)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 384(%rax)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 368(%rax)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 352(%rax)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 336(%rax)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 320(%rax)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 304(%rax)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 288(%rax)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 272(%rax)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 256(%rax)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 240(%rax)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 224(%rax)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 208(%rax)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 192(%rax)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 176(%rax)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 160(%rax)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 144(%rax)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 128(%rax)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 112(%rax)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 96(%rax)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 80(%rax)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 64(%rax)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 48(%rax)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 32(%rax)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 16(%rax)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, (%rax)
-; AVX2-ONLY-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm1, 480(%rax)
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, 480(%r9)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm1, 448(%rax)
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, 448(%r9)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm1, 416(%rax)
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, 416(%r9)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm1, 384(%rax)
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, 384(%r9)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, 352(%r9)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, 320(%r9)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, 288(%r9)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, 256(%r9)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, 224(%r9)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, 192(%r9)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, 160(%r9)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, 128(%r9)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, 96(%r9)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, 64(%r9)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, 32(%r9)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, (%r9)
+; AVX2-ONLY-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, 480(%rax)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, 448(%rax)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, 416(%rax)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, 384(%rax)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, 352(%rax)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, 320(%rax)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, 288(%rax)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, 256(%rax)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, 224(%rax)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, 192(%rax)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, 160(%rax)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, 128(%rax)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, 96(%rax)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, 64(%rax)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, 32(%rax)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, (%rax)
+; AVX2-ONLY-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, 480(%rax)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, 448(%rax)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, 416(%rax)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, 384(%rax)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm1, 352(%rax)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
@@ -9654,7 +9093,7 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, 256(%rax)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, 224(%rax)
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, 192(%rax)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, 160(%rax)
@@ -9668,9089 +9107,2151 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, 32(%rax)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, (%rax)
-; AVX2-ONLY-NEXT:    addq $5560, %rsp # imm = 0x15B8
+; AVX2-ONLY-NEXT:    addq $5064, %rsp # imm = 0x13C8
 ; AVX2-ONLY-NEXT:    vzeroupper
 ; AVX2-ONLY-NEXT:    retq
 ;
-; AVX512F-ONLY-SLOW-LABEL: load_i64_stride8_vf64:
-; AVX512F-ONLY-SLOW:       # %bb.0:
-; AVX512F-ONLY-SLOW-NEXT:    subq $6728, %rsp # imm = 0x1A48
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 3392(%rdi), %zmm3
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 3328(%rdi), %zmm13
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 3520(%rdi), %zmm1
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 3456(%rdi), %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 1856(%rdi), %zmm11
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 1984(%rdi), %zmm15
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 832(%rdi), %zmm6
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 768(%rdi), %zmm7
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 960(%rdi), %zmm24
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 896(%rdi), %zmm10
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 320(%rdi), %zmm12
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 256(%rdi), %zmm5
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 448(%rdi), %zmm8
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 384(%rdi), %zmm14
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    movb $-64, %al
-; AVX512F-ONLY-SLOW-NEXT:    kmovw %eax, %k1
-; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10]
-; AVX512F-ONLY-SLOW-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm1
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm2, %zmm1
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 3264(%rdi), %ymm21
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 3200(%rdi), %ymm0
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm21[0],ymm0[2],ymm21[2]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 3136(%rdi), %ymm4
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 3072(%rdi), %ymm3
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu %ymm3, (%rsp) # 32-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2]
-; AVX512F-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm24, %zmm2, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm1
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm6, %zmm2, %zmm1
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 704(%rdi), %ymm3
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 640(%rdi), %ymm0
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 576(%rdi), %ymm25
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 512(%rdi), %ymm23
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm23[0],ymm25[0],ymm23[2],ymm25[2]
-; AVX512F-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm14, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm8, %zmm2, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm1
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm12, %zmm2, %zmm1
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 192(%rdi), %ymm0
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 128(%rdi), %ymm31
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm31[0],ymm0[0],ymm31[2],ymm0[2]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 64(%rdi), %ymm20
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%rdi), %ymm19
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm19[0],ymm20[0],ymm19[2],ymm20[2]
-; AVX512F-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 1920(%rdi), %zmm3
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm15, %zmm2, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 1792(%rdi), %zmm1
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm11, %zmm2, %zmm1
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 1728(%rdi), %ymm3
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 1664(%rdi), %ymm0
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 1600(%rdi), %ymm18
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 1536(%rdi), %ymm26
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm26[0],ymm18[0],ymm26[2],ymm18[2]
-; AVX512F-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 1472(%rdi), %zmm1
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 1408(%rdi), %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 1344(%rdi), %zmm3
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 1280(%rdi), %zmm1
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm2, %zmm1
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 1216(%rdi), %ymm28
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 1152(%rdi), %ymm29
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm29[0],ymm28[0],ymm29[2],ymm28[2]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 1088(%rdi), %ymm30
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 1024(%rdi), %ymm27
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm27[0],ymm30[0],ymm27[2],ymm30[2]
-; AVX512F-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 3008(%rdi), %zmm1
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 2944(%rdi), %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 2880(%rdi), %zmm3
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 2816(%rdi), %zmm1
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm2, %zmm1
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 2752(%rdi), %ymm0
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 2688(%rdi), %ymm11
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm11[0],ymm0[0],ymm11[2],ymm0[2]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 2624(%rdi), %ymm16
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 2560(%rdi), %ymm9
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm9[0],ymm16[0],ymm9[2],ymm16[2]
-; AVX512F-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 2496(%rdi), %zmm10
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 2432(%rdi), %zmm7
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm10, %zmm2, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 2368(%rdi), %zmm3
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 2304(%rdi), %zmm1
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm2, %zmm1
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 2240(%rdi), %ymm0
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 2176(%rdi), %ymm3
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 2112(%rdi), %ymm3
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 2048(%rdi), %ymm8
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm8[0],ymm3[0],ymm8[2],ymm3[2]
-; AVX512F-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 4032(%rdi), %zmm1
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 3968(%rdi), %zmm6
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 3904(%rdi), %zmm1
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 3840(%rdi), %zmm3
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm1, %zmm3, %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm3
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k1}
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 3776(%rdi), %ymm22
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 3712(%rdi), %ymm17
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm15 = ymm17[0],ymm22[0],ymm17[2],ymm22[2]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 3648(%rdi), %ymm12
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 3584(%rdi), %ymm0
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm14 = ymm0[0],ymm12[0],ymm0[2],ymm12[2]
-; AVX512F-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm14 = ymm14[2,3],ymm15[2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm14, %zmm2, %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,3,11,3,11,3,11]
-; AVX512F-ONLY-SLOW-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm14
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm2, %zmm14
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm15
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm15 # 64-byte Folded Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm14, %zmm15 {%k1}
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm14 = ymm13[1],ymm21[1],ymm13[3],ymm21[3]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu (%rsp), %ymm13 # 32-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
-; AVX512F-ONLY-SLOW-NEXT:    # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3]
-; AVX512F-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm13 = ymm13[2,3],ymm14[2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm13, %zmm15, %zmm13
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm24, %zmm2, %zmm13
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm14
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm14 # 64-byte Folded Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm14 {%k1}
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
-; AVX512F-ONLY-SLOW-NEXT:    # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3]
-; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm15 = ymm23[1],ymm25[1],ymm23[3],ymm25[3]
-; AVX512F-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm13, %zmm14, %zmm13
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm25, %zmm13
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm24, %zmm2, %zmm13
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm23, %zmm2, %zmm5
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm5 {%k1}
-; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm31, %ymm13 # 32-byte Folded Reload
-; AVX512F-ONLY-SLOW-NEXT:    # ymm13 = ymm31[1],mem[1],ymm31[3],mem[3]
-; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm15 = ymm19[1],ymm20[1],ymm19[3],ymm20[3]
-; AVX512F-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm13, %zmm5, %zmm13
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm13
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm20, %zmm2, %zmm13
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm19, %zmm2, %zmm14
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm14 {%k1}
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm13 # 32-byte Folded Reload
-; AVX512F-ONLY-SLOW-NEXT:    # ymm13 = ymm5[1],mem[1],ymm5[3],mem[3]
-; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm15 = ymm26[1],ymm18[1],ymm26[3],ymm18[3]
-; AVX512F-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm13, %zmm14, %zmm13
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm13
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm26, %zmm2, %zmm13
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm14 # 64-byte Folded Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm14 {%k1}
-; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm13 = ymm29[1],ymm28[1],ymm29[3],ymm28[3]
-; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm15 = ymm27[1],ymm30[1],ymm27[3],ymm30[3]
-; AVX512F-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm13, %zmm14, %zmm13
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm27, %zmm13
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm28, %zmm2, %zmm13
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm30, %zmm14
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm29, %zmm2, %zmm14
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm14 {%k1}
-; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
-; AVX512F-ONLY-SLOW-NEXT:    # ymm11 = ymm11[1],mem[1],ymm11[3],mem[3]
-; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm9 = ymm9[1],ymm16[1],ymm9[3],ymm16[3]
-; AVX512F-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm9 = ymm9[2,3],ymm11[2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm9, %zmm14, %zmm9
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm10, %zmm2, %zmm7
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm10
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm10 # 64-byte Folded Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm10 {%k1}
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm7 # 32-byte Folded Reload
-; AVX512F-ONLY-SLOW-NEXT:    # ymm7 = ymm5[1],mem[1],ymm5[3],mem[3]
-; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm5 # 32-byte Folded Reload
-; AVX512F-ONLY-SLOW-NEXT:    # ymm5 = ymm8[1],mem[1],ymm8[3],mem[3]
-; AVX512F-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm5 = ymm5[2,3],ymm7[2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm5, %zmm10, %zmm5
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm14, %zmm2, %zmm6
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm3, %zmm8, %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm2 {%k1}
-; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm17[1],ymm22[1],ymm17[3],ymm22[3]
-; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm12[1],ymm0[3],ymm12[3]
-; AVX512F-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12]
-; AVX512F-ONLY-SLOW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm13
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm11
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm0, %zmm1
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm2 = zmm22[0],zmm9[0],zmm22[2],zmm9[2],zmm22[4],zmm9[4],zmm22[6],zmm9[6]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 3136(%rdi), %zmm4
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 3072(%rdi), %zmm15
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm15, %zmm3
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm0, %zmm3
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 3264(%rdi), %zmm1
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 3200(%rdi), %zmm4
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm6 = [4,12,4,12]
-; AVX512F-ONLY-SLOW-NEXT:    # ymm6 = mem[0,1,0,1]
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm6, %zmm4
-; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm2, %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm21[0],zmm17[0],zmm21[2],zmm17[2],zmm21[4],zmm17[4],zmm21[6],zmm17[6]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 576(%rdi), %zmm1
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 512(%rdi), %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm0, %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 704(%rdi), %zmm21
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 640(%rdi), %zmm4
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm21, %zmm6, %zmm4
-; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm25, %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm24, %zmm0, %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm12[0],zmm23[0],zmm12[2],zmm23[2],zmm12[4],zmm23[4],zmm12[6],zmm23[6]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 64(%rdi), %zmm1
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm0, %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 192(%rdi), %zmm1
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 128(%rdi), %zmm4
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm6, %zmm4
-; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm20, %zmm0, %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm1[0],zmm19[0],zmm1[2],zmm19[2],zmm1[4],zmm19[4],zmm1[6],zmm19[6]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 1600(%rdi), %zmm1
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 1536(%rdi), %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm0, %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 1728(%rdi), %zmm1
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 1664(%rdi), %zmm4
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm6, %zmm4
-; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm26, %zmm0, %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm1[0],zmm10[0],zmm1[2],zmm10[2],zmm1[4],zmm10[4],zmm1[6],zmm10[6]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 1088(%rdi), %zmm1
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 1024(%rdi), %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm0, %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 1216(%rdi), %zmm1
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 1152(%rdi), %zmm4
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm6, %zmm4
-; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm27, %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm28, %zmm0, %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm30[0],zmm29[0],zmm30[2],zmm29[2],zmm30[4],zmm29[4],zmm30[6],zmm29[6]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 2624(%rdi), %zmm1
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 2560(%rdi), %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm0, %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 2752(%rdi), %zmm1
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 2688(%rdi), %zmm4
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm6, %zmm4
-; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 2112(%rdi), %zmm7
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 2048(%rdi), %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm0, %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 2240(%rdi), %zmm1
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 2176(%rdi), %zmm3
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm6, %zmm3
-; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm26, %zmm3
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm0, %zmm3
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm4 = zmm16[0],zmm30[0],zmm16[2],zmm30[2],zmm16[4],zmm30[4],zmm16[6],zmm30[6]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm14, %zmm0, %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 3648(%rdi), %zmm3
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 3584(%rdi), %zmm1
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, (%rsp) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm3, %zmm1, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 3776(%rdi), %zmm3
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 3712(%rdi), %zmm1
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm3, %zmm1, %zmm6
-; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7]
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm1 # 64-byte Folded Reload
-; AVX512F-ONLY-SLOW-NEXT:    # zmm1 = zmm8[0],mem[0],zmm8[2],mem[2],zmm8[4],mem[4],zmm8[6],mem[6]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm1 {%k1}
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [5,13,5,13,5,13,5,13]
-; AVX512F-ONLY-SLOW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm25, %zmm0, %zmm15
-; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [5,13,5,13]
-; AVX512F-ONLY-SLOW-NEXT:    # ymm1 = mem[0,1,0,1]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm24, %zmm1, %zmm3
-; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3],ymm3[4,5,6,7]
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm11, %zmm0, %zmm13
-; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm4 = zmm22[1],zmm9[1],zmm22[3],zmm9[3],zmm22[5],zmm9[5],zmm22[7],zmm9[7]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm4 {%k1}
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm20, %zmm0, %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm21, %zmm1, %zmm3
-; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm3
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm18, %zmm0, %zmm3
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm19
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm4 = zmm17[1],zmm19[1],zmm17[3],zmm19[3],zmm17[5],zmm19[5],zmm17[7],zmm19[7]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm14, %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm16, %zmm0, %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm28, %zmm1, %zmm3
-; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm4 # 64-byte Folded Reload
-; AVX512F-ONLY-SLOW-NEXT:    # zmm4 = zmm12[1],mem[1],zmm12[3],mem[3],zmm12[5],mem[5],zmm12[7],mem[7]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm15, %zmm0, %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm13, %zmm1, %zmm3
-; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload
-; AVX512F-ONLY-SLOW-NEXT:    # zmm4 = zmm4[1],mem[1],zmm4[3],mem[3],zmm4[5],mem[5],zmm4[7],mem[7]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm11, %zmm0, %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm29, %zmm1, %zmm3
-; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm27
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm4 = zmm4[1],zmm10[1],zmm4[3],zmm10[3],zmm4[5],zmm10[5],zmm4[7],zmm10[7]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm9, %zmm0, %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm10, %zmm1, %zmm3
-; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload
-; AVX512F-ONLY-SLOW-NEXT:    # zmm4 = zmm4[1],mem[1],zmm4[3],mem[3],zmm4[5],mem[5],zmm4[7],mem[7]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm0, %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm3
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm1, %zmm3
-; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm26, %zmm3
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm0, %zmm3
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm4 = zmm5[1],zmm30[1],zmm5[3],zmm30[3],zmm5[5],zmm30[5],zmm5[7],zmm30[7]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 (%rsp), %zmm6 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm31, %zmm6, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm30, %zmm3, %zmm1
-; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm1 # 64-byte Folded Reload
-; AVX512F-ONLY-SLOW-NEXT:    # zmm1 = zmm4[1],mem[1],zmm4[3],mem[3],zmm4[5],mem[5],zmm4[7],mem[7]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm1 {%k1}
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [6,14,6,14,6,14,6,14]
-; AVX512F-ONLY-SLOW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm25, %zmm0, %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14]
-; AVX512F-ONLY-SLOW-NEXT:    # ymm1 = mem[0,1,0,1]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm24, %zmm1, %zmm3
-; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm22, %zmm3
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm22, %zmm3 {%k1} # 64-byte Folded Reload
-; AVX512F-ONLY-SLOW-NEXT:    # zmm3 {%k1} = zmm22[0],mem[0],zmm22[2],mem[2],zmm22[4],mem[4],zmm22[6],mem[6]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm20, %zmm0, %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm21, %zmm1, %zmm3
-; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm3
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm19, %zmm0, %zmm3
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm23[0],zmm18[0],zmm23[2],zmm18[2],zmm23[4],zmm18[4],zmm23[6],zmm18[6]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm16, %zmm0, %zmm14
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm28, %zmm1, %zmm3
-; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3],ymm3[4,5,6,7]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm3
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm18
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm28, %zmm0, %zmm3
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm20[0],zmm14[0],zmm20[2],zmm14[2],zmm20[4],zmm14[4],zmm20[6],zmm14[6]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm15, %zmm0, %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm13, %zmm1, %zmm3
-; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm3
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm15, %zmm0, %zmm3
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm26[0],zmm25[0],zmm26[2],zmm25[2],zmm26[4],zmm25[4],zmm26[6],zmm25[6]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm11, %zmm0, %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm29, %zmm1, %zmm3
-; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm27, %zmm0, %zmm3
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm24[0],zmm29[0],zmm24[2],zmm29[2],zmm24[4],zmm29[4],zmm24[6],zmm29[6]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm9, %zmm0, %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm10, %zmm1, %zmm3
-; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm3
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm10, %zmm0, %zmm3
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm22[0],zmm19[0],zmm22[2],zmm19[2],zmm22[4],zmm19[4],zmm22[6],zmm19[6]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm1, %zmm8
-; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm8, %zmm0, %zmm5
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm21[0],zmm17[0],zmm21[2],zmm17[2],zmm21[4],zmm17[4],zmm21[6],zmm17[6]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm5, %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm31, %zmm6, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm30, %zmm3, %zmm1
-; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm1[0],zmm23[0],zmm1[2],zmm23[2],zmm1[4],zmm23[4],zmm1[6],zmm23[6]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm30
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm5
-; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm16 = [7,15,7,15,7,15,7,15]
-; AVX512F-ONLY-SLOW-NEXT:    # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm2, %zmm16, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm4[1],zmm1[1],zmm4[3],zmm1[3],zmm4[5],zmm1[5],zmm4[7],zmm1[7]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm6
-; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,8,0,8,0,8,0,8]
-; AVX512F-ONLY-SLOW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm3, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm13 = [1,9,1,9,1,9,1,9]
-; AVX512F-ONLY-SLOW-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm13, %zmm6
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm2, %zmm3, %zmm30
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm2, %zmm13, %zmm5
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm1
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm28, %zmm16, %zmm1
-; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm20[1],zmm14[1],zmm20[3],zmm14[3],zmm20[5],zmm14[5],zmm20[7],zmm14[7]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm1
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm14, %zmm3, %zmm20
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm14, %zmm13, %zmm1
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm28, %zmm3, %zmm18
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm28, %zmm13, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm28
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm15, %zmm16, %zmm12
-; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm12 {%k1} = zmm26[1],zmm25[1],zmm26[3],zmm25[3],zmm26[5],zmm25[5],zmm26[7],zmm25[7]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm26, %zmm1
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm25, %zmm3, %zmm26
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm25, %zmm13, %zmm1
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm15, %zmm3, %zmm28
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm15, %zmm13, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm25
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm27, %zmm16, %zmm1
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm24, %zmm7
-; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm24[1],zmm29[1],zmm24[3],zmm29[3],zmm24[5],zmm29[5],zmm24[7],zmm29[7]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm24, %zmm1
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm29, %zmm3, %zmm7
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm29, %zmm13, %zmm1
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm27, %zmm3, %zmm25
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm27, %zmm13, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm27
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm10, %zmm16, %zmm9
-; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm9 {%k1} = zmm22[1],zmm19[1],zmm22[3],zmm19[3],zmm22[5],zmm19[5],zmm22[7],zmm19[7]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm22, %zmm1
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm19, %zmm3, %zmm22
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm19, %zmm13, %zmm1
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm10, %zmm3, %zmm27
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm10, %zmm13, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm29
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm8, %zmm16, %zmm31
-; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm31 {%k1} = zmm21[1],zmm17[1],zmm21[3],zmm17[3],zmm21[5],zmm17[5],zmm21[7],zmm17[7]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm1
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm19
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm17, %zmm3, %zmm19
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm17, %zmm13, %zmm1
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm8, %zmm3, %zmm29
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm8, %zmm13, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm8, %zmm3, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm22
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm9, %zmm3, %zmm22
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm24, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm23, %zmm3, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm26, %zmm15, %zmm3
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm8, %zmm13, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm17
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm9, %zmm13, %zmm17
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm24, %zmm1
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm23, %zmm13, %zmm1
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm26, %zmm15, %zmm13
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm26, %zmm16, %zmm15
-; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm24[1],zmm23[1],zmm24[3],zmm23[3],zmm24[5],zmm23[5],zmm24[7],zmm23[7]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm15, %zmm21
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm15
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm9, %zmm16, %zmm15
-; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm10[1],zmm8[1],zmm10[3],zmm8[3],zmm10[5],zmm8[5],zmm10[7],zmm8[7]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm15, %zmm23
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm12 # 64-byte Folded Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm5 # 64-byte Folded Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm6 # 64-byte Folded Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm7 # 64-byte Folded Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm8 # 64-byte Folded Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm10 # 64-byte Folded Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm14 # 64-byte Folded Reload
-; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} ymm16 = [7,15,7,15]
-; AVX512F-ONLY-SLOW-NEXT:    # ymm16 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm15 # 64-byte Folded Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm9 # 64-byte Folded Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm1 # 64-byte Folded Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm2 # 64-byte Folded Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm4 # 64-byte Folded Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm11 # 64-byte Folded Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm11 # 64-byte Folded Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm15 = ymm12[0,1,2,3],ymm15[4,5,6,7]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm15, %zmm12, %zmm12
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm15 = ymm5[0,1,2,3],ymm9[4,5,6,7]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm15, %zmm5, %zmm5
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm15 = ymm6[0,1,2,3],ymm0[4,5,6,7]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm15, %zmm0, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm15 = ymm7[0,1,2,3],ymm1[4,5,6,7]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm15, %zmm0, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm15 = ymm8[0,1,2,3],ymm2[4,5,6,7]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm15, %zmm0, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm15 = ymm10[0,1,2,3],ymm4[4,5,6,7]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm15, %zmm31, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload
-; AVX512F-ONLY-SLOW-NEXT:    # ymm15 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm15, %zmm21, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm15 = ymm14[0,1,2,3],ymm11[4,5,6,7]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm15, %zmm23, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm18 {%k1}
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 192(%rdi), %xmm0
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 128(%rdi), %xmm1
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 64(%rdi), %xmm1
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm16 = xmm0[0],xmm1[0]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti32x4 $1, %xmm2, %ymm16, %ymm2
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm18, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm30 {%k1}
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 704(%rdi), %xmm8
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 640(%rdi), %xmm20
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm20[0],xmm8[0]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 576(%rdi), %xmm18
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 512(%rdi), %xmm16
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm31 = xmm16[0],xmm18[0]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti32x4 $1, %xmm2, %ymm31, %ymm2
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm30, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm25 {%k1}
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 1216(%rdi), %xmm0
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 1152(%rdi), %xmm1
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm31 = xmm1[0],xmm0[0]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 1088(%rdi), %xmm23
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 1024(%rdi), %xmm14
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm30 = xmm14[0],xmm23[0]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti32x4 $1, %xmm31, %ymm30, %ymm30
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm30, %zmm25, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm28 {%k1}
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 1728(%rdi), %xmm30
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 1664(%rdi), %xmm31
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm31[0],xmm30[0]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 1600(%rdi), %xmm12
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 1536(%rdi), %xmm11
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm25 = xmm11[0],xmm12[0]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti32x4 $1, %xmm2, %ymm25, %ymm2
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm28, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm19, %zmm29 {%k1}
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 2240(%rdi), %xmm19
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 2176(%rdi), %xmm21
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm25 = xmm21[0],xmm19[0]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 2112(%rdi), %xmm15
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 2048(%rdi), %xmm1
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm28 = xmm1[0],xmm15[0]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti32x4 $1, %xmm25, %ymm28, %ymm25
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm25, %zmm29, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm27 {%k1}
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 2752(%rdi), %xmm28
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 2688(%rdi), %xmm29
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm29[0],xmm28[0]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 2624(%rdi), %xmm10
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 2560(%rdi), %xmm9
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm25 = xmm9[0],xmm10[0]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti32x4 $1, %xmm2, %ymm25, %ymm2
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm27, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm22 {%k1}
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 3264(%rdi), %xmm25
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 3200(%rdi), %xmm27
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm27[0],xmm25[0]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 3136(%rdi), %xmm0
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 3072(%rdi), %xmm2
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm26 = xmm2[0],xmm0[0]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti32x4 $1, %xmm4, %ymm26, %ymm4
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm22, %zmm22
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm3 {%k1}
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 3776(%rdi), %xmm7
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 3712(%rdi), %xmm26
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm26[0],xmm7[0]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 3648(%rdi), %xmm6
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 3584(%rdi), %xmm5
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm24 = xmm5[0],xmm6[0]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti32x4 $1, %xmm4, %ymm24, %ymm4
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm3, %zmm24
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm17 {%k1}
-; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm4 = xmm27[1],xmm25[1]
-; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm0[1]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm4, %ymm2, %ymm2
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm17, %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm0 {%k1}
-; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm3 = xmm20[1],xmm8[1]
-; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm4 = xmm16[1],xmm18[1]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm3, %ymm4, %ymm3
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm0, %zmm3
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm0 {%k1}
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
-; AVX512F-ONLY-SLOW-NEXT:    # xmm4 = xmm4[1],mem[1]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload
-; AVX512F-ONLY-SLOW-NEXT:    # xmm8 = xmm8[1],mem[1]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm4, %ymm8, %ymm4
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm0, %zmm4
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm0 {%k1}
-; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm8 = xmm31[1],xmm30[1]
-; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm11 = xmm11[1],xmm12[1]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm8, %ymm11, %ymm8
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm8, %zmm0, %zmm8
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm0 {%k1}
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload
-; AVX512F-ONLY-SLOW-NEXT:    # xmm11 = xmm11[1],mem[1]
-; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm12 = xmm14[1],xmm23[1]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm11, %ymm12, %ymm11
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm11, %zmm0, %zmm11
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm0 {%k1}
-; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm12 = xmm29[1],xmm28[1]
-; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm9 = xmm9[1],xmm10[1]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm12, %ymm9, %ymm9
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm9, %zmm0, %zmm9
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm0 {%k1}
-; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm10 = xmm21[1],xmm19[1]
-; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm12 = xmm1[1],xmm15[1]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm10, %ymm12, %ymm10
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm10, %zmm0, %zmm10
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm13 {%k1}
-; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm7 = xmm26[1],xmm7[1]
-; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm6[1]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm7, %ymm5, %ymm5
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm5, %zmm13, %zmm1
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm24, 448(%rsi)
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm22, 384(%rsi)
-; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 320(%rsi)
-; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 256(%rsi)
-; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 192(%rsi)
-; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 128(%rsi)
-; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 64(%rsi)
-; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, (%rsi)
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, 448(%rdx)
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm10, 256(%rdx)
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, 320(%rdx)
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm11, 128(%rdx)
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, 192(%rdx)
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, (%rdx)
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, 64(%rdx)
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, 384(%rdx)
-; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 448(%rcx)
-; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 256(%rcx)
-; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 320(%rcx)
-; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 128(%rcx)
-; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 192(%rcx)
-; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, (%rcx)
-; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 64(%rcx)
-; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 384(%rcx)
-; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 448(%r8)
-; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 256(%r8)
-; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 320(%r8)
-; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 128(%r8)
-; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 192(%r8)
-; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, (%r8)
-; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 64(%r8)
-; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 384(%r8)
-; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 448(%r9)
-; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 256(%r9)
-; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 320(%r9)
-; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 128(%r9)
-; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 192(%r9)
-; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, (%r9)
-; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 64(%r9)
-; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 384(%r9)
-; AVX512F-ONLY-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 448(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 256(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 320(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 128(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 192(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, (%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 64(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 384(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 448(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 256(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 320(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 128(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 192(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, (%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 64(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 384(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512F-ONLY-SLOW-NEXT:    vmovups (%rsp), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 384(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 448(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 256(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 320(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 128(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 192(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, (%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 64(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    addq $6728, %rsp # imm = 0x1A48
-; AVX512F-ONLY-SLOW-NEXT:    vzeroupper
-; AVX512F-ONLY-SLOW-NEXT:    retq
-;
-; AVX512F-ONLY-FAST-LABEL: load_i64_stride8_vf64:
-; AVX512F-ONLY-FAST:       # %bb.0:
-; AVX512F-ONLY-FAST-NEXT:    subq $6728, %rsp # imm = 0x1A48
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 3392(%rdi), %zmm3
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 3328(%rdi), %zmm13
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 3520(%rdi), %zmm1
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 3456(%rdi), %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 1856(%rdi), %zmm11
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 1984(%rdi), %zmm15
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 832(%rdi), %zmm6
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 768(%rdi), %zmm7
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 960(%rdi), %zmm24
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 896(%rdi), %zmm10
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 320(%rdi), %zmm12
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 256(%rdi), %zmm5
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 448(%rdi), %zmm8
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 384(%rdi), %zmm14
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    movb $-64, %al
-; AVX512F-ONLY-FAST-NEXT:    kmovw %eax, %k1
-; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10]
-; AVX512F-ONLY-FAST-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm1
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm2, %zmm1
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 3264(%rdi), %ymm21
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa 3200(%rdi), %ymm0
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm21[0],ymm0[2],ymm21[2]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa 3136(%rdi), %ymm4
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa 3072(%rdi), %ymm3
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu %ymm3, (%rsp) # 32-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2]
-; AVX512F-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm10, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm24, %zmm2, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, %zmm1
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm6, %zmm2, %zmm1
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa 704(%rdi), %ymm3
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa 640(%rdi), %ymm0
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 576(%rdi), %ymm25
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 512(%rdi), %ymm23
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm23[0],ymm25[0],ymm23[2],ymm25[2]
-; AVX512F-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm14, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm8, %zmm2, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, %zmm1
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm12, %zmm2, %zmm1
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa 192(%rdi), %ymm0
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 128(%rdi), %ymm31
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm31[0],ymm0[0],ymm31[2],ymm0[2]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%rdi), %ymm20
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%rdi), %ymm19
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm19[0],ymm20[0],ymm19[2],ymm20[2]
-; AVX512F-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 1920(%rdi), %zmm3
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm15, %zmm2, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 1792(%rdi), %zmm1
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm11, %zmm2, %zmm1
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa 1728(%rdi), %ymm3
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa 1664(%rdi), %ymm0
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 1600(%rdi), %ymm18
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 1536(%rdi), %ymm26
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm26[0],ymm18[0],ymm26[2],ymm18[2]
-; AVX512F-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 1472(%rdi), %zmm1
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 1408(%rdi), %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 1344(%rdi), %zmm3
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 1280(%rdi), %zmm1
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm2, %zmm1
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 1216(%rdi), %ymm28
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 1152(%rdi), %ymm29
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm29[0],ymm28[0],ymm29[2],ymm28[2]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 1088(%rdi), %ymm30
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 1024(%rdi), %ymm27
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm27[0],ymm30[0],ymm27[2],ymm30[2]
-; AVX512F-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 3008(%rdi), %zmm1
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 2944(%rdi), %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 2880(%rdi), %zmm3
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 2816(%rdi), %zmm1
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm2, %zmm1
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa 2752(%rdi), %ymm0
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa 2688(%rdi), %ymm11
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm11[0],ymm0[0],ymm11[2],ymm0[2]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 2624(%rdi), %ymm16
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa 2560(%rdi), %ymm9
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm9[0],ymm16[0],ymm9[2],ymm16[2]
-; AVX512F-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 2496(%rdi), %zmm10
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 2432(%rdi), %zmm7
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm10, %zmm2, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 2368(%rdi), %zmm3
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 2304(%rdi), %zmm1
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm2, %zmm1
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa 2240(%rdi), %ymm0
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa 2176(%rdi), %ymm3
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa 2112(%rdi), %ymm3
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa 2048(%rdi), %ymm8
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm8[0],ymm3[0],ymm8[2],ymm3[2]
-; AVX512F-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 4032(%rdi), %zmm1
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 3968(%rdi), %zmm6
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm6, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 3904(%rdi), %zmm1
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 3840(%rdi), %zmm3
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm1, %zmm3, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm3
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k1}
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 3776(%rdi), %ymm22
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 3712(%rdi), %ymm17
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm15 = ymm17[0],ymm22[0],ymm17[2],ymm22[2]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa 3648(%rdi), %ymm12
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa 3584(%rdi), %ymm0
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm14 = ymm0[0],ymm12[0],ymm0[2],ymm12[2]
-; AVX512F-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm14 = ymm14[2,3],ymm15[2,3]
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm14, %zmm2, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,3,11,3,11,3,11]
-; AVX512F-ONLY-FAST-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm14
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm2, %zmm14
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm15
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm15 # 64-byte Folded Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm14, %zmm15 {%k1}
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm14 = ymm13[1],ymm21[1],ymm13[3],ymm21[3]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu (%rsp), %ymm13 # 32-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
-; AVX512F-ONLY-FAST-NEXT:    # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3]
-; AVX512F-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm13 = ymm13[2,3],ymm14[2,3]
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm13, %zmm15, %zmm13
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm24, %zmm2, %zmm13
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm21, %zmm14
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm14 # 64-byte Folded Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm14 {%k1}
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
-; AVX512F-ONLY-FAST-NEXT:    # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3]
-; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm15 = ymm23[1],ymm25[1],ymm23[3],ymm25[3]
-; AVX512F-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3]
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm13, %zmm14, %zmm13
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm25, %zmm13
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm24, %zmm2, %zmm13
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm23, %zmm2, %zmm5
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm5 {%k1}
-; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm31, %ymm13 # 32-byte Folded Reload
-; AVX512F-ONLY-FAST-NEXT:    # ymm13 = ymm31[1],mem[1],ymm31[3],mem[3]
-; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm15 = ymm19[1],ymm20[1],ymm19[3],ymm20[3]
-; AVX512F-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3]
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm13, %zmm5, %zmm13
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm31, %zmm13
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm20, %zmm2, %zmm13
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm19, %zmm2, %zmm14
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm14 {%k1}
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm13 # 32-byte Folded Reload
-; AVX512F-ONLY-FAST-NEXT:    # ymm13 = ymm5[1],mem[1],ymm5[3],mem[3]
-; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm15 = ymm26[1],ymm18[1],ymm26[3],ymm18[3]
-; AVX512F-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3]
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm13, %zmm14, %zmm13
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm18, %zmm13
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm26, %zmm2, %zmm13
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm14 # 64-byte Folded Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm14 {%k1}
-; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm13 = ymm29[1],ymm28[1],ymm29[3],ymm28[3]
-; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm15 = ymm27[1],ymm30[1],ymm27[3],ymm30[3]
-; AVX512F-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3]
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm13, %zmm14, %zmm13
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm27, %zmm13
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm28, %zmm2, %zmm13
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm30, %zmm14
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm29, %zmm2, %zmm14
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm14 {%k1}
-; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
-; AVX512F-ONLY-FAST-NEXT:    # ymm11 = ymm11[1],mem[1],ymm11[3],mem[3]
-; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm9 = ymm9[1],ymm16[1],ymm9[3],ymm16[3]
-; AVX512F-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm9 = ymm9[2,3],ymm11[2,3]
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm9, %zmm14, %zmm9
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm10, %zmm2, %zmm7
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm16, %zmm10
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm10 # 64-byte Folded Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, %zmm10 {%k1}
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm7 # 32-byte Folded Reload
-; AVX512F-ONLY-FAST-NEXT:    # ymm7 = ymm5[1],mem[1],ymm5[3],mem[3]
-; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm5 # 32-byte Folded Reload
-; AVX512F-ONLY-FAST-NEXT:    # ymm5 = ymm8[1],mem[1],ymm8[3],mem[3]
-; AVX512F-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm5 = ymm5[2,3],ymm7[2,3]
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm5, %zmm10, %zmm5
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm14, %zmm2, %zmm6
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm3, %zmm8, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm6, %zmm2 {%k1}
-; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm17[1],ymm22[1],ymm17[3],ymm22[3]
-; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm12[1],ymm0[3],ymm12[3]
-; AVX512F-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3]
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12]
-; AVX512F-ONLY-FAST-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm13
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm11
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm0, %zmm1
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm2 = zmm22[0],zmm9[0],zmm22[2],zmm9[2],zmm22[4],zmm9[4],zmm22[6],zmm9[6]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 3136(%rdi), %zmm4
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 3072(%rdi), %zmm15
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm15, %zmm3
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm0, %zmm3
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 3264(%rdi), %zmm1
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 3200(%rdi), %zmm4
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm6 = [4,12,4,12]
-; AVX512F-ONLY-FAST-NEXT:    # ymm6 = mem[0,1,0,1]
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm6, %zmm4
-; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7]
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm2, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm21[0],zmm17[0],zmm21[2],zmm17[2],zmm21[4],zmm17[4],zmm21[6],zmm17[6]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 576(%rdi), %zmm1
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 512(%rdi), %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm0, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 704(%rdi), %zmm21
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 640(%rdi), %zmm4
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm21, %zmm6, %zmm4
-; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm25, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm24, %zmm0, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm12[0],zmm23[0],zmm12[2],zmm23[2],zmm12[4],zmm23[4],zmm12[6],zmm23[6]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%rdi), %zmm1
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%rdi), %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm0, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 192(%rdi), %zmm1
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 128(%rdi), %zmm4
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm6, %zmm4
-; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm31, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm20, %zmm0, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm1[0],zmm19[0],zmm1[2],zmm19[2],zmm1[4],zmm19[4],zmm1[6],zmm19[6]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 1600(%rdi), %zmm1
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 1536(%rdi), %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm0, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 1728(%rdi), %zmm1
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 1664(%rdi), %zmm4
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm6, %zmm4
-; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm18, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm26, %zmm0, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm1[0],zmm10[0],zmm1[2],zmm10[2],zmm1[4],zmm10[4],zmm1[6],zmm10[6]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 1088(%rdi), %zmm1
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 1024(%rdi), %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm0, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 1216(%rdi), %zmm1
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 1152(%rdi), %zmm4
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm6, %zmm4
-; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm27, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm28, %zmm0, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm30[0],zmm29[0],zmm30[2],zmm29[2],zmm30[4],zmm29[4],zmm30[6],zmm29[6]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 2624(%rdi), %zmm1
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 2560(%rdi), %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm0, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 2752(%rdi), %zmm1
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 2688(%rdi), %zmm4
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm6, %zmm4
-; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 2112(%rdi), %zmm7
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 2048(%rdi), %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm0, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 2240(%rdi), %zmm1
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 2176(%rdi), %zmm3
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm6, %zmm3
-; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm26, %zmm3
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm0, %zmm3
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm4 = zmm16[0],zmm30[0],zmm16[2],zmm30[2],zmm16[4],zmm30[4],zmm16[6],zmm30[6]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm31, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm14, %zmm0, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 3648(%rdi), %zmm3
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 3584(%rdi), %zmm1
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, (%rsp) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm3, %zmm1, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 3776(%rdi), %zmm3
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 3712(%rdi), %zmm1
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm3, %zmm1, %zmm6
-; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7]
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm1 # 64-byte Folded Reload
-; AVX512F-ONLY-FAST-NEXT:    # zmm1 = zmm8[0],mem[0],zmm8[2],mem[2],zmm8[4],mem[4],zmm8[6],mem[6]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm1 {%k1}
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [5,13,5,13,5,13,5,13]
-; AVX512F-ONLY-FAST-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm25, %zmm0, %zmm15
-; AVX512F-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [5,13,5,13]
-; AVX512F-ONLY-FAST-NEXT:    # ymm1 = mem[0,1,0,1]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm24, %zmm1, %zmm3
-; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3],ymm3[4,5,6,7]
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm11, %zmm0, %zmm13
-; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm4 = zmm22[1],zmm9[1],zmm22[3],zmm9[3],zmm22[5],zmm9[5],zmm22[7],zmm9[7]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm4 {%k1}
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm20, %zmm0, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm21, %zmm1, %zmm3
-; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm23, %zmm3
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm18, %zmm0, %zmm3
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm17, %zmm19
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm4 = zmm17[1],zmm19[1],zmm17[3],zmm19[3],zmm17[5],zmm19[5],zmm17[7],zmm19[7]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm14, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm16, %zmm0, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm28, %zmm1, %zmm3
-; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
-; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm4 # 64-byte Folded Reload
-; AVX512F-ONLY-FAST-NEXT:    # zmm4 = zmm12[1],mem[1],zmm12[3],mem[3],zmm12[5],mem[5],zmm12[7],mem[7]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm15, %zmm0, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm13, %zmm1, %zmm3
-; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload
-; AVX512F-ONLY-FAST-NEXT:    # zmm4 = zmm4[1],mem[1],zmm4[3],mem[3],zmm4[5],mem[5],zmm4[7],mem[7]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm11, %zmm0, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm29, %zmm1, %zmm3
-; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm10, %zmm27
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm4 = zmm4[1],zmm10[1],zmm4[3],zmm10[3],zmm4[5],zmm10[5],zmm4[7],zmm10[7]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm9, %zmm0, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm10, %zmm1, %zmm3
-; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload
-; AVX512F-ONLY-FAST-NEXT:    # zmm4 = zmm4[1],mem[1],zmm4[3],mem[3],zmm4[5],mem[5],zmm4[7],mem[7]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm0, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, %zmm3
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm1, %zmm3
-; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm26, %zmm3
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm0, %zmm3
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm4 = zmm5[1],zmm30[1],zmm5[3],zmm30[3],zmm5[5],zmm30[5],zmm5[7],zmm30[7]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm31, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 (%rsp), %zmm6 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm31, %zmm6, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm30, %zmm3, %zmm1
-; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm1 # 64-byte Folded Reload
-; AVX512F-ONLY-FAST-NEXT:    # zmm1 = zmm4[1],mem[1],zmm4[3],mem[3],zmm4[5],mem[5],zmm4[7],mem[7]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm1 {%k1}
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [6,14,6,14,6,14,6,14]
-; AVX512F-ONLY-FAST-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm25, %zmm0, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14]
-; AVX512F-ONLY-FAST-NEXT:    # ymm1 = mem[0,1,0,1]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm24, %zmm1, %zmm3
-; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm22, %zmm3
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm22, %zmm3 {%k1} # 64-byte Folded Reload
-; AVX512F-ONLY-FAST-NEXT:    # zmm3 {%k1} = zmm22[0],mem[0],zmm22[2],mem[2],zmm22[4],mem[4],zmm22[6],mem[6]
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm20, %zmm0, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm21, %zmm1, %zmm3
-; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm17, %zmm3
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm19, %zmm0, %zmm3
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm23[0],zmm18[0],zmm23[2],zmm18[2],zmm23[4],zmm18[4],zmm23[6],zmm18[6]
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm16, %zmm0, %zmm14
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm28, %zmm1, %zmm3
-; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3],ymm3[4,5,6,7]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm12, %zmm3
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm12, %zmm18
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm28, %zmm0, %zmm3
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm20[0],zmm14[0],zmm20[2],zmm14[2],zmm20[4],zmm14[4],zmm20[6],zmm14[6]
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm15, %zmm0, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm13, %zmm1, %zmm3
-; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm12, %zmm3
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm15, %zmm0, %zmm3
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm26[0],zmm25[0],zmm26[2],zmm25[2],zmm26[4],zmm25[4],zmm26[6],zmm25[6]
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm11, %zmm0, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm29, %zmm1, %zmm3
-; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm27, %zmm0, %zmm3
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm24[0],zmm29[0],zmm24[2],zmm29[2],zmm24[4],zmm29[4],zmm24[6],zmm29[6]
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm9, %zmm0, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm10, %zmm1, %zmm3
-; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, %zmm3
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm10, %zmm0, %zmm3
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm22[0],zmm19[0],zmm22[2],zmm19[2],zmm22[4],zmm19[4],zmm22[6],zmm19[6]
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm1, %zmm8
-; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm8, %zmm0, %zmm5
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm21[0],zmm17[0],zmm21[2],zmm17[2],zmm21[4],zmm17[4],zmm21[6],zmm17[6]
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm5, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload
-; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm31, %zmm6, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm30, %zmm3, %zmm1
-; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm1[0],zmm23[0],zmm1[2],zmm23[2],zmm1[4],zmm23[4],zmm1[6],zmm23[6]
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm30
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm5
-; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm16 = [7,15,7,15,7,15,7,15]
-; AVX512F-ONLY-FAST-NEXT:    # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm2, %zmm16, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm4[1],zmm1[1],zmm4[3],zmm1[3],zmm4[5],zmm1[5],zmm4[7],zmm1[7]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm6
-; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,8,0,8,0,8,0,8]
-; AVX512F-ONLY-FAST-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm3, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm13 = [1,9,1,9,1,9,1,9]
-; AVX512F-ONLY-FAST-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm13, %zmm6
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm2, %zmm3, %zmm30
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm2, %zmm13, %zmm5
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm18, %zmm1
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm18, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm28, %zmm16, %zmm1
-; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm20[1],zmm14[1],zmm20[3],zmm14[3],zmm20[5],zmm14[5],zmm20[7],zmm14[7]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm20, %zmm1
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm14, %zmm3, %zmm20
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm14, %zmm13, %zmm1
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm28, %zmm3, %zmm18
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm28, %zmm13, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm12, %zmm28
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm12, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm15, %zmm16, %zmm12
-; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm12 {%k1} = zmm26[1],zmm25[1],zmm26[3],zmm25[3],zmm26[5],zmm25[5],zmm26[7],zmm25[7]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm26, %zmm1
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm25, %zmm3, %zmm26
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm25, %zmm13, %zmm1
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm15, %zmm3, %zmm28
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm15, %zmm13, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm25
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm27, %zmm16, %zmm1
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm24, %zmm7
-; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm24[1],zmm29[1],zmm24[3],zmm29[3],zmm24[5],zmm29[5],zmm24[7],zmm29[7]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm24, %zmm1
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm29, %zmm3, %zmm7
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm29, %zmm13, %zmm1
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm27, %zmm3, %zmm25
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm27, %zmm13, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, %zmm27
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm10, %zmm16, %zmm9
-; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm9 {%k1} = zmm22[1],zmm19[1],zmm22[3],zmm19[3],zmm22[5],zmm19[5],zmm22[7],zmm19[7]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm22, %zmm1
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm19, %zmm3, %zmm22
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm19, %zmm13, %zmm1
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm10, %zmm3, %zmm27
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm10, %zmm13, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm31, %zmm29
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm31, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm8, %zmm16, %zmm31
-; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm31 {%k1} = zmm21[1],zmm17[1],zmm21[3],zmm17[3],zmm21[5],zmm17[5],zmm21[7],zmm17[7]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm21, %zmm1
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm21, %zmm19
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm17, %zmm3, %zmm19
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm17, %zmm13, %zmm1
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm8, %zmm3, %zmm29
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm8, %zmm13, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm10, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm8, %zmm3, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm21, %zmm22
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm9, %zmm3, %zmm22
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm24, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm23, %zmm3, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm26, %zmm15, %zmm3
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm10, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm8, %zmm13, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm21, %zmm17
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm21, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm9, %zmm13, %zmm17
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm24, %zmm1
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm23, %zmm13, %zmm1
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm26, %zmm15, %zmm13
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm26, %zmm16, %zmm15
-; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm24[1],zmm23[1],zmm24[3],zmm23[3],zmm24[5],zmm23[5],zmm24[7],zmm23[7]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm15, %zmm21
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm15
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm9, %zmm16, %zmm15
-; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm10[1],zmm8[1],zmm10[3],zmm8[3],zmm10[5],zmm8[5],zmm10[7],zmm8[7]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm15, %zmm23
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm12 # 64-byte Folded Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm5 # 64-byte Folded Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm6 # 64-byte Folded Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm7 # 64-byte Folded Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm8 # 64-byte Folded Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm10 # 64-byte Folded Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm14 # 64-byte Folded Reload
-; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} ymm16 = [7,15,7,15]
-; AVX512F-ONLY-FAST-NEXT:    # ymm16 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm15 # 64-byte Folded Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm9 # 64-byte Folded Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm1 # 64-byte Folded Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm2 # 64-byte Folded Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm4 # 64-byte Folded Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm11 # 64-byte Folded Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm11 # 64-byte Folded Reload
-; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm15 = ymm12[0,1,2,3],ymm15[4,5,6,7]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm15, %zmm12, %zmm12
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm15 = ymm5[0,1,2,3],ymm9[4,5,6,7]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm15, %zmm5, %zmm5
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm15 = ymm6[0,1,2,3],ymm0[4,5,6,7]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm15, %zmm0, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm15 = ymm7[0,1,2,3],ymm1[4,5,6,7]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm15, %zmm0, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm15 = ymm8[0,1,2,3],ymm2[4,5,6,7]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm15, %zmm0, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm15 = ymm10[0,1,2,3],ymm4[4,5,6,7]
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm15, %zmm31, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload
-; AVX512F-ONLY-FAST-NEXT:    # ymm15 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm15, %zmm21, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm15 = ymm14[0,1,2,3],ymm11[4,5,6,7]
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm15, %zmm23, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm20, %zmm18 {%k1}
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa 192(%rdi), %xmm0
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa 128(%rdi), %xmm1
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa 64(%rdi), %xmm1
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm16 = xmm0[0],xmm1[0]
-; AVX512F-ONLY-FAST-NEXT:    vinserti32x4 $1, %xmm2, %ymm16, %ymm2
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm18, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm30 {%k1}
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa 704(%rdi), %xmm8
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 640(%rdi), %xmm20
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm20[0],xmm8[0]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 576(%rdi), %xmm18
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 512(%rdi), %xmm16
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm31 = xmm16[0],xmm18[0]
-; AVX512F-ONLY-FAST-NEXT:    vinserti32x4 $1, %xmm2, %ymm31, %ymm2
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm30, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm25 {%k1}
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa 1216(%rdi), %xmm0
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa 1152(%rdi), %xmm1
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm31 = xmm1[0],xmm0[0]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 1088(%rdi), %xmm23
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa 1024(%rdi), %xmm14
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm30 = xmm14[0],xmm23[0]
-; AVX512F-ONLY-FAST-NEXT:    vinserti32x4 $1, %xmm31, %ymm30, %ymm30
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm30, %zmm25, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm28 {%k1}
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 1728(%rdi), %xmm30
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 1664(%rdi), %xmm31
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm31[0],xmm30[0]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa 1600(%rdi), %xmm12
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa 1536(%rdi), %xmm11
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm25 = xmm11[0],xmm12[0]
-; AVX512F-ONLY-FAST-NEXT:    vinserti32x4 $1, %xmm2, %ymm25, %ymm2
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm28, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm19, %zmm29 {%k1}
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 2240(%rdi), %xmm19
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 2176(%rdi), %xmm21
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm25 = xmm21[0],xmm19[0]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa 2112(%rdi), %xmm15
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa 2048(%rdi), %xmm1
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm28 = xmm1[0],xmm15[0]
-; AVX512F-ONLY-FAST-NEXT:    vinserti32x4 $1, %xmm25, %ymm28, %ymm25
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm25, %zmm29, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm27 {%k1}
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 2752(%rdi), %xmm28
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 2688(%rdi), %xmm29
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm29[0],xmm28[0]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa 2624(%rdi), %xmm10
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa 2560(%rdi), %xmm9
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm25 = xmm9[0],xmm10[0]
-; AVX512F-ONLY-FAST-NEXT:    vinserti32x4 $1, %xmm2, %ymm25, %ymm2
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm27, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm22 {%k1}
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 3264(%rdi), %xmm25
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 3200(%rdi), %xmm27
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm27[0],xmm25[0]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa 3136(%rdi), %xmm0
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa 3072(%rdi), %xmm2
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm26 = xmm2[0],xmm0[0]
-; AVX512F-ONLY-FAST-NEXT:    vinserti32x4 $1, %xmm4, %ymm26, %ymm4
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm22, %zmm22
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm3 {%k1}
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa 3776(%rdi), %xmm7
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 3712(%rdi), %xmm26
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm26[0],xmm7[0]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa 3648(%rdi), %xmm6
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa 3584(%rdi), %xmm5
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm24 = xmm5[0],xmm6[0]
-; AVX512F-ONLY-FAST-NEXT:    vinserti32x4 $1, %xmm4, %ymm24, %ymm4
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm3, %zmm24
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm17 {%k1}
-; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm4 = xmm27[1],xmm25[1]
-; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm0[1]
-; AVX512F-ONLY-FAST-NEXT:    vinserti128 $1, %xmm4, %ymm2, %ymm2
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm17, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm0 {%k1}
-; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm3 = xmm20[1],xmm8[1]
-; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm4 = xmm16[1],xmm18[1]
-; AVX512F-ONLY-FAST-NEXT:    vinserti128 $1, %xmm3, %ymm4, %ymm3
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm0, %zmm3
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm0 {%k1}
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
-; AVX512F-ONLY-FAST-NEXT:    # xmm4 = xmm4[1],mem[1]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload
-; AVX512F-ONLY-FAST-NEXT:    # xmm8 = xmm8[1],mem[1]
-; AVX512F-ONLY-FAST-NEXT:    vinserti128 $1, %xmm4, %ymm8, %ymm4
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm0, %zmm4
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, %zmm0 {%k1}
-; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm8 = xmm31[1],xmm30[1]
-; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm11 = xmm11[1],xmm12[1]
-; AVX512F-ONLY-FAST-NEXT:    vinserti128 $1, %xmm8, %ymm11, %ymm8
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm8, %zmm0, %zmm8
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm11, %zmm0 {%k1}
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload
-; AVX512F-ONLY-FAST-NEXT:    # xmm11 = xmm11[1],mem[1]
-; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm12 = xmm14[1],xmm23[1]
-; AVX512F-ONLY-FAST-NEXT:    vinserti128 $1, %xmm11, %ymm12, %ymm11
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm11, %zmm0, %zmm11
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm12, %zmm0 {%k1}
-; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm12 = xmm29[1],xmm28[1]
-; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm9 = xmm9[1],xmm10[1]
-; AVX512F-ONLY-FAST-NEXT:    vinserti128 $1, %xmm12, %ymm9, %ymm9
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm9, %zmm0, %zmm9
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm10, %zmm0 {%k1}
-; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm10 = xmm21[1],xmm19[1]
-; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm12 = xmm1[1],xmm15[1]
-; AVX512F-ONLY-FAST-NEXT:    vinserti128 $1, %xmm10, %ymm12, %ymm10
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm10, %zmm0, %zmm10
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm13 {%k1}
-; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm7 = xmm26[1],xmm7[1]
-; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm6[1]
-; AVX512F-ONLY-FAST-NEXT:    vinserti128 $1, %xmm7, %ymm5, %ymm5
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm5, %zmm13, %zmm1
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm24, 448(%rsi)
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm22, 384(%rsi)
-; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 320(%rsi)
-; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 256(%rsi)
-; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 192(%rsi)
-; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 128(%rsi)
-; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 64(%rsi)
-; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, (%rsi)
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, 448(%rdx)
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm10, 256(%rdx)
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, 320(%rdx)
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm11, 128(%rdx)
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, 192(%rdx)
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, (%rdx)
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, 64(%rdx)
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, 384(%rdx)
-; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 448(%rcx)
-; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 256(%rcx)
-; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 320(%rcx)
-; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 128(%rcx)
-; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 192(%rcx)
-; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, (%rcx)
-; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 64(%rcx)
-; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 384(%rcx)
-; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 448(%r8)
-; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 256(%r8)
-; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 320(%r8)
-; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 128(%r8)
-; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 192(%r8)
-; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, (%r8)
-; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 64(%r8)
-; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 384(%r8)
-; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 448(%r9)
-; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 256(%r9)
-; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 320(%r9)
-; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 128(%r9)
-; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 192(%r9)
-; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, (%r9)
-; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 64(%r9)
-; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 384(%r9)
-; AVX512F-ONLY-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 448(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 256(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 320(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 128(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 192(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, (%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 64(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 384(%rax)
-; AVX512F-ONLY-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 448(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 256(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 320(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 128(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 192(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, (%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 64(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 384(%rax)
-; AVX512F-ONLY-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512F-ONLY-FAST-NEXT:    vmovups (%rsp), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 384(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 448(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 256(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 320(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 128(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 192(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, (%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 64(%rax)
-; AVX512F-ONLY-FAST-NEXT:    addq $6728, %rsp # imm = 0x1A48
-; AVX512F-ONLY-FAST-NEXT:    vzeroupper
-; AVX512F-ONLY-FAST-NEXT:    retq
-;
-; AVX512DQ-SLOW-LABEL: load_i64_stride8_vf64:
-; AVX512DQ-SLOW:       # %bb.0:
-; AVX512DQ-SLOW-NEXT:    subq $6728, %rsp # imm = 0x1A48
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 3392(%rdi), %zmm3
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 3328(%rdi), %zmm13
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 3520(%rdi), %zmm1
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 3456(%rdi), %zmm0
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 1856(%rdi), %zmm11
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 1984(%rdi), %zmm15
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 832(%rdi), %zmm6
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 768(%rdi), %zmm7
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 960(%rdi), %zmm24
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 896(%rdi), %zmm10
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 320(%rdi), %zmm12
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 256(%rdi), %zmm5
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 448(%rdi), %zmm8
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 384(%rdi), %zmm14
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    movb $-64, %al
-; AVX512DQ-SLOW-NEXT:    kmovw %eax, %k1
-; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10]
-; AVX512DQ-SLOW-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm1
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm3, %zmm2, %zmm1
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 3264(%rdi), %ymm21
-; AVX512DQ-SLOW-NEXT:    vmovdqa 3200(%rdi), %ymm0
-; AVX512DQ-SLOW-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm21[0],ymm0[2],ymm21[2]
-; AVX512DQ-SLOW-NEXT:    vmovdqa 3136(%rdi), %ymm4
-; AVX512DQ-SLOW-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa 3072(%rdi), %ymm3
-; AVX512DQ-SLOW-NEXT:    vmovdqu %ymm3, (%rsp) # 32-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2]
-; AVX512DQ-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm0
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm24, %zmm2, %zmm0
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm1
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm6, %zmm2, %zmm1
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512DQ-SLOW-NEXT:    vmovdqa 704(%rdi), %ymm3
-; AVX512DQ-SLOW-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa 640(%rdi), %ymm0
-; AVX512DQ-SLOW-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 576(%rdi), %ymm25
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 512(%rdi), %ymm23
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm23[0],ymm25[0],ymm23[2],ymm25[2]
-; AVX512DQ-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm14, %zmm0
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm8, %zmm2, %zmm0
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm1
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm12, %zmm2, %zmm1
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512DQ-SLOW-NEXT:    vmovdqa 192(%rdi), %ymm0
-; AVX512DQ-SLOW-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 128(%rdi), %ymm31
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm31[0],ymm0[0],ymm31[2],ymm0[2]
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 64(%rdi), %ymm20
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%rdi), %ymm19
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm19[0],ymm20[0],ymm19[2],ymm20[2]
-; AVX512DQ-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 1920(%rdi), %zmm3
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm0
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm15, %zmm2, %zmm0
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 1792(%rdi), %zmm1
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm11, %zmm2, %zmm1
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512DQ-SLOW-NEXT:    vmovdqa 1728(%rdi), %ymm3
-; AVX512DQ-SLOW-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa 1664(%rdi), %ymm0
-; AVX512DQ-SLOW-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 1600(%rdi), %ymm18
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 1536(%rdi), %ymm26
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm26[0],ymm18[0],ymm26[2],ymm18[2]
-; AVX512DQ-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 1472(%rdi), %zmm1
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 1408(%rdi), %zmm0
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 1344(%rdi), %zmm3
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 1280(%rdi), %zmm1
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm3, %zmm2, %zmm1
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 1216(%rdi), %ymm28
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 1152(%rdi), %ymm29
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm29[0],ymm28[0],ymm29[2],ymm28[2]
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 1088(%rdi), %ymm30
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 1024(%rdi), %ymm27
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm27[0],ymm30[0],ymm27[2],ymm30[2]
-; AVX512DQ-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 3008(%rdi), %zmm1
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 2944(%rdi), %zmm0
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 2880(%rdi), %zmm3
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 2816(%rdi), %zmm1
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm3, %zmm2, %zmm1
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512DQ-SLOW-NEXT:    vmovdqa 2752(%rdi), %ymm0
-; AVX512DQ-SLOW-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa 2688(%rdi), %ymm11
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm11[0],ymm0[0],ymm11[2],ymm0[2]
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 2624(%rdi), %ymm16
-; AVX512DQ-SLOW-NEXT:    vmovdqa 2560(%rdi), %ymm9
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm9[0],ymm16[0],ymm9[2],ymm16[2]
-; AVX512DQ-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 2496(%rdi), %zmm10
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 2432(%rdi), %zmm7
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm0
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm10, %zmm2, %zmm0
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 2368(%rdi), %zmm3
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 2304(%rdi), %zmm1
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm3, %zmm2, %zmm1
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512DQ-SLOW-NEXT:    vmovdqa 2240(%rdi), %ymm0
-; AVX512DQ-SLOW-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa 2176(%rdi), %ymm3
-; AVX512DQ-SLOW-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2]
-; AVX512DQ-SLOW-NEXT:    vmovdqa 2112(%rdi), %ymm3
-; AVX512DQ-SLOW-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa 2048(%rdi), %ymm8
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm8[0],ymm3[0],ymm8[2],ymm3[2]
-; AVX512DQ-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 4032(%rdi), %zmm1
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 3968(%rdi), %zmm6
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm0
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 3904(%rdi), %zmm1
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 3840(%rdi), %zmm3
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm1, %zmm3, %zmm2
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm3
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k1}
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 3776(%rdi), %ymm22
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 3712(%rdi), %ymm17
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm15 = ymm17[0],ymm22[0],ymm17[2],ymm22[2]
-; AVX512DQ-SLOW-NEXT:    vmovdqa 3648(%rdi), %ymm12
-; AVX512DQ-SLOW-NEXT:    vmovdqa 3584(%rdi), %ymm0
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm14 = ymm0[0],ymm12[0],ymm0[2],ymm12[2]
-; AVX512DQ-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm14 = ymm14[2,3],ymm15[2,3]
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm14, %zmm2, %zmm2
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,3,11,3,11,3,11]
-; AVX512DQ-SLOW-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm14
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm4, %zmm2, %zmm14
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm15
-; AVX512DQ-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm15 # 64-byte Folded Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm14, %zmm15 {%k1}
-; AVX512DQ-SLOW-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm14 = ymm13[1],ymm21[1],ymm13[3],ymm21[3]
-; AVX512DQ-SLOW-NEXT:    vmovdqu (%rsp), %ymm13 # 32-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
-; AVX512DQ-SLOW-NEXT:    # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3]
-; AVX512DQ-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm13 = ymm13[2,3],ymm14[2,3]
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm13, %zmm15, %zmm13
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm24, %zmm2, %zmm13
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm14
-; AVX512DQ-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm14 # 64-byte Folded Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm14 {%k1}
-; AVX512DQ-SLOW-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
-; AVX512DQ-SLOW-NEXT:    # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3]
-; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm15 = ymm23[1],ymm25[1],ymm23[3],ymm25[3]
-; AVX512DQ-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3]
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm13, %zmm14, %zmm13
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm25, %zmm13
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm24, %zmm2, %zmm13
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm23, %zmm2, %zmm5
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm5 {%k1}
-; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm31, %ymm13 # 32-byte Folded Reload
-; AVX512DQ-SLOW-NEXT:    # ymm13 = ymm31[1],mem[1],ymm31[3],mem[3]
-; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm15 = ymm19[1],ymm20[1],ymm19[3],ymm20[3]
-; AVX512DQ-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3]
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm13, %zmm5, %zmm13
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm13
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm20, %zmm2, %zmm13
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm19, %zmm2, %zmm14
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm14 {%k1}
-; AVX512DQ-SLOW-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm13 # 32-byte Folded Reload
-; AVX512DQ-SLOW-NEXT:    # ymm13 = ymm5[1],mem[1],ymm5[3],mem[3]
-; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm15 = ymm26[1],ymm18[1],ymm26[3],ymm18[3]
-; AVX512DQ-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3]
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm13, %zmm14, %zmm13
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm13
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm26, %zmm2, %zmm13
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm14 # 64-byte Folded Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm14 {%k1}
-; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm13 = ymm29[1],ymm28[1],ymm29[3],ymm28[3]
-; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm15 = ymm27[1],ymm30[1],ymm27[3],ymm30[3]
-; AVX512DQ-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3]
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm13, %zmm14, %zmm13
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm27, %zmm13
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm28, %zmm2, %zmm13
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm30, %zmm14
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm29, %zmm2, %zmm14
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm14 {%k1}
-; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
-; AVX512DQ-SLOW-NEXT:    # ymm11 = ymm11[1],mem[1],ymm11[3],mem[3]
-; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm9 = ymm9[1],ymm16[1],ymm9[3],ymm16[3]
-; AVX512DQ-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm9 = ymm9[2,3],ymm11[2,3]
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm9, %zmm14, %zmm9
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm10, %zmm2, %zmm7
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm10
-; AVX512DQ-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm10 # 64-byte Folded Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm10 {%k1}
-; AVX512DQ-SLOW-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm7 # 32-byte Folded Reload
-; AVX512DQ-SLOW-NEXT:    # ymm7 = ymm5[1],mem[1],ymm5[3],mem[3]
-; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm5 # 32-byte Folded Reload
-; AVX512DQ-SLOW-NEXT:    # ymm5 = ymm8[1],mem[1],ymm8[3],mem[3]
-; AVX512DQ-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm5 = ymm5[2,3],ymm7[2,3]
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm5, %zmm10, %zmm5
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm14, %zmm2, %zmm6
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm3, %zmm8, %zmm2
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm2 {%k1}
-; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm17[1],ymm22[1],ymm17[3],ymm22[3]
-; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm12[1],ymm0[3],ymm12[3]
-; AVX512DQ-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3]
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm0
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12]
-; AVX512DQ-SLOW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm13
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm11
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm4, %zmm0, %zmm1
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm2 = zmm22[0],zmm9[0],zmm22[2],zmm9[2],zmm22[4],zmm9[4],zmm22[6],zmm9[6]
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 3136(%rdi), %zmm4
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 3072(%rdi), %zmm15
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm15, %zmm3
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm4, %zmm0, %zmm3
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 3264(%rdi), %zmm1
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 3200(%rdi), %zmm4
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm6 = [4,12,4,12]
-; AVX512DQ-SLOW-NEXT:    # ymm6 = mem[0,1,0,1]
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm1, %zmm6, %zmm4
-; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7]
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm2, %zmm2
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm21[0],zmm17[0],zmm21[2],zmm17[2],zmm21[4],zmm17[4],zmm21[6],zmm17[6]
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 576(%rdi), %zmm1
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 512(%rdi), %zmm2
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm1, %zmm0, %zmm2
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 704(%rdi), %zmm21
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 640(%rdi), %zmm4
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm21, %zmm6, %zmm4
-; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm25, %zmm2
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm24, %zmm0, %zmm2
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm12[0],zmm23[0],zmm12[2],zmm23[2],zmm12[4],zmm23[4],zmm12[6],zmm23[6]
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 64(%rdi), %zmm1
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm2
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm1, %zmm0, %zmm2
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 192(%rdi), %zmm1
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 128(%rdi), %zmm4
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm1, %zmm6, %zmm4
-; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm2
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm20, %zmm0, %zmm2
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm1[0],zmm19[0],zmm1[2],zmm19[2],zmm1[4],zmm19[4],zmm1[6],zmm19[6]
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 1600(%rdi), %zmm1
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 1536(%rdi), %zmm2
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm1, %zmm0, %zmm2
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 1728(%rdi), %zmm1
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 1664(%rdi), %zmm4
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm1, %zmm6, %zmm4
-; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm2
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm26, %zmm0, %zmm2
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm1[0],zmm10[0],zmm1[2],zmm10[2],zmm1[4],zmm10[4],zmm1[6],zmm10[6]
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 1088(%rdi), %zmm1
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 1024(%rdi), %zmm2
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm1, %zmm0, %zmm2
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 1216(%rdi), %zmm1
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 1152(%rdi), %zmm4
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm1, %zmm6, %zmm4
-; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm27, %zmm2
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm28, %zmm0, %zmm2
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm30[0],zmm29[0],zmm30[2],zmm29[2],zmm30[4],zmm29[4],zmm30[6],zmm29[6]
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 2624(%rdi), %zmm1
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 2560(%rdi), %zmm2
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm1, %zmm0, %zmm2
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 2752(%rdi), %zmm1
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 2688(%rdi), %zmm4
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm1, %zmm6, %zmm4
-; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 2112(%rdi), %zmm7
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 2048(%rdi), %zmm2
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm7, %zmm0, %zmm2
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 2240(%rdi), %zmm1
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 2176(%rdi), %zmm3
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm1, %zmm6, %zmm3
-; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm26, %zmm3
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm5, %zmm0, %zmm3
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm4 = zmm16[0],zmm30[0],zmm16[2],zmm30[2],zmm16[4],zmm30[4],zmm16[6],zmm30[6]
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm2
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm14, %zmm0, %zmm2
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 3648(%rdi), %zmm3
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 3584(%rdi), %zmm1
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, (%rsp) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm3, %zmm1, %zmm0
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 3776(%rdi), %zmm3
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 3712(%rdi), %zmm1
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm3, %zmm1, %zmm6
-; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7]
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm1 # 64-byte Folded Reload
-; AVX512DQ-SLOW-NEXT:    # zmm1 = zmm8[0],mem[0],zmm8[2],mem[2],zmm8[4],mem[4],zmm8[6],mem[6]
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm1 {%k1}
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [5,13,5,13,5,13,5,13]
-; AVX512DQ-SLOW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm25, %zmm0, %zmm15
-; AVX512DQ-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [5,13,5,13]
-; AVX512DQ-SLOW-NEXT:    # ymm1 = mem[0,1,0,1]
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm24, %zmm1, %zmm3
-; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3],ymm3[4,5,6,7]
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm11, %zmm0, %zmm13
-; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm4 = zmm22[1],zmm9[1],zmm22[3],zmm9[3],zmm22[5],zmm9[5],zmm22[7],zmm9[7]
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm4 {%k1}
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm20, %zmm0, %zmm2
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm21, %zmm1, %zmm3
-; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm3
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm18, %zmm0, %zmm3
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm19
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm4 = zmm17[1],zmm19[1],zmm17[3],zmm19[3],zmm17[5],zmm19[5],zmm17[7],zmm19[7]
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm14, %zmm2
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm16, %zmm0, %zmm2
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm28, %zmm1, %zmm3
-; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
-; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm4 # 64-byte Folded Reload
-; AVX512DQ-SLOW-NEXT:    # zmm4 = zmm12[1],mem[1],zmm12[3],mem[3],zmm12[5],mem[5],zmm12[7],mem[7]
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm15, %zmm0, %zmm2
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm13, %zmm1, %zmm3
-; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload
-; AVX512DQ-SLOW-NEXT:    # zmm4 = zmm4[1],mem[1],zmm4[3],mem[3],zmm4[5],mem[5],zmm4[7],mem[7]
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm11, %zmm0, %zmm2
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm29, %zmm1, %zmm3
-; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm27
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm4 = zmm4[1],zmm10[1],zmm4[3],zmm10[3],zmm4[5],zmm10[5],zmm4[7],zmm10[7]
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm9, %zmm0, %zmm2
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm10, %zmm1, %zmm3
-; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload
-; AVX512DQ-SLOW-NEXT:    # zmm4 = zmm4[1],mem[1],zmm4[3],mem[3],zmm4[5],mem[5],zmm4[7],mem[7]
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm7, %zmm0, %zmm2
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm3
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm7, %zmm1, %zmm3
-; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm26, %zmm3
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm5, %zmm0, %zmm3
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm4 = zmm5[1],zmm30[1],zmm5[3],zmm30[3],zmm5[5],zmm30[5],zmm5[7],zmm30[7]
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm2
-; AVX512DQ-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 (%rsp), %zmm6 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm31, %zmm6, %zmm0
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm30, %zmm3, %zmm1
-; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm1 # 64-byte Folded Reload
-; AVX512DQ-SLOW-NEXT:    # zmm1 = zmm4[1],mem[1],zmm4[3],mem[3],zmm4[5],mem[5],zmm4[7],mem[7]
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm1 {%k1}
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [6,14,6,14,6,14,6,14]
-; AVX512DQ-SLOW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm25, %zmm0, %zmm2
-; AVX512DQ-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14]
-; AVX512DQ-SLOW-NEXT:    # ymm1 = mem[0,1,0,1]
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm24, %zmm1, %zmm3
-; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm22, %zmm3
-; AVX512DQ-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm22, %zmm3 {%k1} # 64-byte Folded Reload
-; AVX512DQ-SLOW-NEXT:    # zmm3 {%k1} = zmm22[0],mem[0],zmm22[2],mem[2],zmm22[4],mem[4],zmm22[6],mem[6]
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm20, %zmm0, %zmm2
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm21, %zmm1, %zmm3
-; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm3
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm19, %zmm0, %zmm3
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm23[0],zmm18[0],zmm23[2],zmm18[2],zmm23[4],zmm18[4],zmm23[6],zmm18[6]
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm16, %zmm0, %zmm14
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm28, %zmm1, %zmm3
-; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3],ymm3[4,5,6,7]
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm3
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm18
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm28, %zmm0, %zmm3
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm20[0],zmm14[0],zmm20[2],zmm14[2],zmm20[4],zmm14[4],zmm20[6],zmm14[6]
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm15, %zmm0, %zmm2
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm13, %zmm1, %zmm3
-; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm3
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm15, %zmm0, %zmm3
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm26[0],zmm25[0],zmm26[2],zmm25[2],zmm26[4],zmm25[4],zmm26[6],zmm25[6]
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm11, %zmm0, %zmm2
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm29, %zmm1, %zmm3
-; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm27, %zmm0, %zmm3
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm24[0],zmm29[0],zmm24[2],zmm29[2],zmm24[4],zmm29[4],zmm24[6],zmm29[6]
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm9, %zmm0, %zmm2
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm10, %zmm1, %zmm3
-; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm3
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm10, %zmm0, %zmm3
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm22[0],zmm19[0],zmm22[2],zmm19[2],zmm22[4],zmm19[4],zmm22[6],zmm19[6]
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm7, %zmm1, %zmm8
-; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7]
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm8, %zmm0, %zmm5
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm21[0],zmm17[0],zmm21[2],zmm17[2],zmm21[4],zmm17[4],zmm21[6],zmm17[6]
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm5, %zmm2
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload
-; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm31, %zmm6, %zmm0
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm30, %zmm3, %zmm1
-; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm1[0],zmm23[0],zmm1[2],zmm23[2],zmm1[4],zmm23[4],zmm1[6],zmm23[6]
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm0
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm30
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm5
-; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm16 = [7,15,7,15,7,15,7,15]
-; AVX512DQ-SLOW-NEXT:    # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm2, %zmm16, %zmm0
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm4[1],zmm1[1],zmm4[3],zmm1[3],zmm4[5],zmm1[5],zmm4[7],zmm1[7]
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm0
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm6
-; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,8,0,8,0,8,0,8]
-; AVX512DQ-SLOW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm1, %zmm3, %zmm0
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm13 = [1,9,1,9,1,9,1,9]
-; AVX512DQ-SLOW-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm1, %zmm13, %zmm6
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm2, %zmm3, %zmm30
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm2, %zmm13, %zmm5
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm1
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm0
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm28, %zmm16, %zmm1
-; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm20[1],zmm14[1],zmm20[3],zmm14[3],zmm20[5],zmm14[5],zmm20[7],zmm14[7]
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm1
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm14, %zmm3, %zmm20
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm14, %zmm13, %zmm1
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm28, %zmm3, %zmm18
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm28, %zmm13, %zmm0
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm28
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm0
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm15, %zmm16, %zmm12
-; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm12 {%k1} = zmm26[1],zmm25[1],zmm26[3],zmm25[3],zmm26[5],zmm25[5],zmm26[7],zmm25[7]
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm26, %zmm1
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm25, %zmm3, %zmm26
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm25, %zmm13, %zmm1
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm15, %zmm3, %zmm28
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm15, %zmm13, %zmm0
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm25
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm0
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm27, %zmm16, %zmm1
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm24, %zmm7
-; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm24[1],zmm29[1],zmm24[3],zmm29[3],zmm24[5],zmm29[5],zmm24[7],zmm29[7]
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm24, %zmm1
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm29, %zmm3, %zmm7
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm29, %zmm13, %zmm1
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm27, %zmm3, %zmm25
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm27, %zmm13, %zmm0
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm27
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm0
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm10, %zmm16, %zmm9
-; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm9 {%k1} = zmm22[1],zmm19[1],zmm22[3],zmm19[3],zmm22[5],zmm19[5],zmm22[7],zmm19[7]
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm22, %zmm1
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm19, %zmm3, %zmm22
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm19, %zmm13, %zmm1
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm10, %zmm3, %zmm27
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm10, %zmm13, %zmm0
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm29
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm0
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm8, %zmm16, %zmm31
-; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm31 {%k1} = zmm21[1],zmm17[1],zmm21[3],zmm17[3],zmm21[5],zmm17[5],zmm21[7],zmm17[7]
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm1
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm19
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm17, %zmm3, %zmm19
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm17, %zmm13, %zmm1
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm8, %zmm3, %zmm29
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm8, %zmm13, %zmm0
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm0
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm8, %zmm3, %zmm0
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm22
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm9, %zmm3, %zmm22
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm24, %zmm0
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm23, %zmm3, %zmm0
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm26, %zmm15, %zmm3
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm0
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm8, %zmm13, %zmm0
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm17
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm0
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm9, %zmm13, %zmm17
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm24, %zmm1
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm23, %zmm13, %zmm1
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm26, %zmm15, %zmm13
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm26, %zmm16, %zmm15
-; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm24[1],zmm23[1],zmm24[3],zmm23[3],zmm24[5],zmm23[5],zmm24[7],zmm23[7]
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm15, %zmm21
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm15
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm9, %zmm16, %zmm15
-; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm10[1],zmm8[1],zmm10[3],zmm8[3],zmm10[5],zmm8[5],zmm10[7],zmm8[7]
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm15, %zmm23
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm12 # 64-byte Folded Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm5 # 64-byte Folded Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm6 # 64-byte Folded Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm7 # 64-byte Folded Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm8 # 64-byte Folded Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm10 # 64-byte Folded Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm14 # 64-byte Folded Reload
-; AVX512DQ-SLOW-NEXT:    vbroadcasti64x2 {{.*#+}} ymm16 = [7,15,7,15]
-; AVX512DQ-SLOW-NEXT:    # ymm16 = mem[0,1,0,1]
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm15 # 64-byte Folded Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm9 # 64-byte Folded Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm1 # 64-byte Folded Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm2 # 64-byte Folded Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm4 # 64-byte Folded Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm11 # 64-byte Folded Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm11 # 64-byte Folded Reload
-; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm15 = ymm12[0,1,2,3],ymm15[4,5,6,7]
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm15, %zmm12, %zmm12
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm15 = ymm5[0,1,2,3],ymm9[4,5,6,7]
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm15, %zmm5, %zmm5
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm15 = ymm6[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm15, %zmm0, %zmm0
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm15 = ymm7[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm15, %zmm0, %zmm0
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm15 = ymm8[0,1,2,3],ymm2[4,5,6,7]
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm15, %zmm0, %zmm0
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm15 = ymm10[0,1,2,3],ymm4[4,5,6,7]
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm15, %zmm31, %zmm0
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload
-; AVX512DQ-SLOW-NEXT:    # ymm15 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm15, %zmm21, %zmm0
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm15 = ymm14[0,1,2,3],ymm11[4,5,6,7]
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm15, %zmm23, %zmm0
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm18 {%k1}
-; AVX512DQ-SLOW-NEXT:    vmovdqa 192(%rdi), %xmm0
-; AVX512DQ-SLOW-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa 128(%rdi), %xmm1
-; AVX512DQ-SLOW-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX512DQ-SLOW-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512DQ-SLOW-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa 64(%rdi), %xmm1
-; AVX512DQ-SLOW-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm16 = xmm0[0],xmm1[0]
-; AVX512DQ-SLOW-NEXT:    vinserti32x4 $1, %xmm2, %ymm16, %ymm2
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm18, %zmm0
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm30 {%k1}
-; AVX512DQ-SLOW-NEXT:    vmovdqa 704(%rdi), %xmm8
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 640(%rdi), %xmm20
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm20[0],xmm8[0]
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 576(%rdi), %xmm18
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 512(%rdi), %xmm16
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm31 = xmm16[0],xmm18[0]
-; AVX512DQ-SLOW-NEXT:    vinserti32x4 $1, %xmm2, %ymm31, %ymm2
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm30, %zmm0
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm25 {%k1}
-; AVX512DQ-SLOW-NEXT:    vmovdqa 1216(%rdi), %xmm0
-; AVX512DQ-SLOW-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa 1152(%rdi), %xmm1
-; AVX512DQ-SLOW-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm31 = xmm1[0],xmm0[0]
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 1088(%rdi), %xmm23
-; AVX512DQ-SLOW-NEXT:    vmovdqa 1024(%rdi), %xmm14
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm30 = xmm14[0],xmm23[0]
-; AVX512DQ-SLOW-NEXT:    vinserti32x4 $1, %xmm31, %ymm30, %ymm30
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm30, %zmm25, %zmm0
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm28 {%k1}
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 1728(%rdi), %xmm30
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 1664(%rdi), %xmm31
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm31[0],xmm30[0]
-; AVX512DQ-SLOW-NEXT:    vmovdqa 1600(%rdi), %xmm12
-; AVX512DQ-SLOW-NEXT:    vmovdqa 1536(%rdi), %xmm11
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm25 = xmm11[0],xmm12[0]
-; AVX512DQ-SLOW-NEXT:    vinserti32x4 $1, %xmm2, %ymm25, %ymm2
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm28, %zmm0
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm19, %zmm29 {%k1}
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 2240(%rdi), %xmm19
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 2176(%rdi), %xmm21
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm25 = xmm21[0],xmm19[0]
-; AVX512DQ-SLOW-NEXT:    vmovdqa 2112(%rdi), %xmm15
-; AVX512DQ-SLOW-NEXT:    vmovdqa 2048(%rdi), %xmm1
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm28 = xmm1[0],xmm15[0]
-; AVX512DQ-SLOW-NEXT:    vinserti32x4 $1, %xmm25, %ymm28, %ymm25
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm25, %zmm29, %zmm0
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm27 {%k1}
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 2752(%rdi), %xmm28
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 2688(%rdi), %xmm29
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm29[0],xmm28[0]
-; AVX512DQ-SLOW-NEXT:    vmovdqa 2624(%rdi), %xmm10
-; AVX512DQ-SLOW-NEXT:    vmovdqa 2560(%rdi), %xmm9
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm25 = xmm9[0],xmm10[0]
-; AVX512DQ-SLOW-NEXT:    vinserti32x4 $1, %xmm2, %ymm25, %ymm2
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm27, %zmm0
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm22 {%k1}
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 3264(%rdi), %xmm25
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 3200(%rdi), %xmm27
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm27[0],xmm25[0]
-; AVX512DQ-SLOW-NEXT:    vmovdqa 3136(%rdi), %xmm0
-; AVX512DQ-SLOW-NEXT:    vmovdqa 3072(%rdi), %xmm2
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm26 = xmm2[0],xmm0[0]
-; AVX512DQ-SLOW-NEXT:    vinserti32x4 $1, %xmm4, %ymm26, %ymm4
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm22, %zmm22
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm3 {%k1}
-; AVX512DQ-SLOW-NEXT:    vmovdqa 3776(%rdi), %xmm7
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 3712(%rdi), %xmm26
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm26[0],xmm7[0]
-; AVX512DQ-SLOW-NEXT:    vmovdqa 3648(%rdi), %xmm6
-; AVX512DQ-SLOW-NEXT:    vmovdqa 3584(%rdi), %xmm5
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm24 = xmm5[0],xmm6[0]
-; AVX512DQ-SLOW-NEXT:    vinserti32x4 $1, %xmm4, %ymm24, %ymm4
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm3, %zmm24
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm17 {%k1}
-; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm4 = xmm27[1],xmm25[1]
-; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm0[1]
-; AVX512DQ-SLOW-NEXT:    vinserti128 $1, %xmm4, %ymm2, %ymm2
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm17, %zmm2
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm0 {%k1}
-; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm3 = xmm20[1],xmm8[1]
-; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm4 = xmm16[1],xmm18[1]
-; AVX512DQ-SLOW-NEXT:    vinserti128 $1, %xmm3, %ymm4, %ymm3
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm0, %zmm3
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm0 {%k1}
-; AVX512DQ-SLOW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
-; AVX512DQ-SLOW-NEXT:    # xmm4 = xmm4[1],mem[1]
-; AVX512DQ-SLOW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload
-; AVX512DQ-SLOW-NEXT:    # xmm8 = xmm8[1],mem[1]
-; AVX512DQ-SLOW-NEXT:    vinserti128 $1, %xmm4, %ymm8, %ymm4
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm0, %zmm4
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm0 {%k1}
-; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm8 = xmm31[1],xmm30[1]
-; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm11 = xmm11[1],xmm12[1]
-; AVX512DQ-SLOW-NEXT:    vinserti128 $1, %xmm8, %ymm11, %ymm8
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm8, %zmm0, %zmm8
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm0 {%k1}
-; AVX512DQ-SLOW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload
-; AVX512DQ-SLOW-NEXT:    # xmm11 = xmm11[1],mem[1]
-; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm12 = xmm14[1],xmm23[1]
-; AVX512DQ-SLOW-NEXT:    vinserti128 $1, %xmm11, %ymm12, %ymm11
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm11, %zmm0, %zmm11
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm0 {%k1}
-; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm12 = xmm29[1],xmm28[1]
-; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm9 = xmm9[1],xmm10[1]
-; AVX512DQ-SLOW-NEXT:    vinserti128 $1, %xmm12, %ymm9, %ymm9
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm9, %zmm0, %zmm9
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm0 {%k1}
-; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm10 = xmm21[1],xmm19[1]
-; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm12 = xmm1[1],xmm15[1]
-; AVX512DQ-SLOW-NEXT:    vinserti128 $1, %xmm10, %ymm12, %ymm10
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm10, %zmm0, %zmm10
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm13 {%k1}
-; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm7 = xmm26[1],xmm7[1]
-; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm6[1]
-; AVX512DQ-SLOW-NEXT:    vinserti128 $1, %xmm7, %ymm5, %ymm5
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm5, %zmm13, %zmm1
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm24, 448(%rsi)
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm22, 384(%rsi)
-; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 320(%rsi)
-; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 256(%rsi)
-; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 192(%rsi)
-; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 128(%rsi)
-; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 64(%rsi)
-; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, (%rsi)
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, 448(%rdx)
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm10, 256(%rdx)
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm9, 320(%rdx)
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm11, 128(%rdx)
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm8, 192(%rdx)
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm4, (%rdx)
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, 64(%rdx)
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm2, 384(%rdx)
-; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 448(%rcx)
-; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 256(%rcx)
-; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 320(%rcx)
-; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 128(%rcx)
-; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 192(%rcx)
-; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, (%rcx)
-; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 64(%rcx)
-; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 384(%rcx)
-; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 448(%r8)
-; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 256(%r8)
-; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 320(%r8)
-; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 128(%r8)
-; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 192(%r8)
-; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, (%r8)
-; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 64(%r8)
-; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 384(%r8)
-; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 448(%r9)
-; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 256(%r9)
-; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 320(%r9)
-; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 128(%r9)
-; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 192(%r9)
-; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, (%r9)
-; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 64(%r9)
-; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 384(%r9)
-; AVX512DQ-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 448(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 256(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 320(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 128(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 192(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, (%rax)
-; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 64(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 384(%rax)
-; AVX512DQ-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 448(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 256(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 320(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 128(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 192(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, (%rax)
-; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 64(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 384(%rax)
-; AVX512DQ-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-SLOW-NEXT:    vmovups (%rsp), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 384(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 448(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 256(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 320(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 128(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 192(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, (%rax)
-; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 64(%rax)
-; AVX512DQ-SLOW-NEXT:    addq $6728, %rsp # imm = 0x1A48
-; AVX512DQ-SLOW-NEXT:    vzeroupper
-; AVX512DQ-SLOW-NEXT:    retq
-;
-; AVX512DQ-FAST-LABEL: load_i64_stride8_vf64:
-; AVX512DQ-FAST:       # %bb.0:
-; AVX512DQ-FAST-NEXT:    subq $6728, %rsp # imm = 0x1A48
-; AVX512DQ-FAST-NEXT:    vmovdqa64 3392(%rdi), %zmm3
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 3328(%rdi), %zmm13
-; AVX512DQ-FAST-NEXT:    vmovdqa64 3520(%rdi), %zmm1
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 3456(%rdi), %zmm0
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 1856(%rdi), %zmm11
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 1984(%rdi), %zmm15
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 832(%rdi), %zmm6
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 768(%rdi), %zmm7
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 960(%rdi), %zmm24
-; AVX512DQ-FAST-NEXT:    vmovdqa64 896(%rdi), %zmm10
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 320(%rdi), %zmm12
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 256(%rdi), %zmm5
-; AVX512DQ-FAST-NEXT:    vmovdqa64 448(%rdi), %zmm8
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 384(%rdi), %zmm14
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    movb $-64, %al
-; AVX512DQ-FAST-NEXT:    kmovw %eax, %k1
-; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10]
-; AVX512DQ-FAST-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm13, %zmm1
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm3, %zmm2, %zmm1
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512DQ-FAST-NEXT:    vmovdqa64 3264(%rdi), %ymm21
-; AVX512DQ-FAST-NEXT:    vmovdqa 3200(%rdi), %ymm0
-; AVX512DQ-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm21[0],ymm0[2],ymm21[2]
-; AVX512DQ-FAST-NEXT:    vmovdqa 3136(%rdi), %ymm4
-; AVX512DQ-FAST-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa 3072(%rdi), %ymm3
-; AVX512DQ-FAST-NEXT:    vmovdqu %ymm3, (%rsp) # 32-byte Spill
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2]
-; AVX512DQ-FAST-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm10, %zmm0
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm24, %zmm2, %zmm0
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm7, %zmm1
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm6, %zmm2, %zmm1
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512DQ-FAST-NEXT:    vmovdqa 704(%rdi), %ymm3
-; AVX512DQ-FAST-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa 640(%rdi), %ymm0
-; AVX512DQ-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
-; AVX512DQ-FAST-NEXT:    vmovdqa64 576(%rdi), %ymm25
-; AVX512DQ-FAST-NEXT:    vmovdqa64 512(%rdi), %ymm23
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm23[0],ymm25[0],ymm23[2],ymm25[2]
-; AVX512DQ-FAST-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm14, %zmm0
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm8, %zmm2, %zmm0
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm5, %zmm1
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm12, %zmm2, %zmm1
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512DQ-FAST-NEXT:    vmovdqa 192(%rdi), %ymm0
-; AVX512DQ-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 128(%rdi), %ymm31
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm31[0],ymm0[0],ymm31[2],ymm0[2]
-; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%rdi), %ymm20
-; AVX512DQ-FAST-NEXT:    vmovdqa64 (%rdi), %ymm19
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm19[0],ymm20[0],ymm19[2],ymm20[2]
-; AVX512DQ-FAST-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
-; AVX512DQ-FAST-NEXT:    vmovdqa64 1920(%rdi), %zmm3
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, %zmm0
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm15, %zmm2, %zmm0
-; AVX512DQ-FAST-NEXT:    vmovdqa64 1792(%rdi), %zmm1
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm11, %zmm2, %zmm1
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512DQ-FAST-NEXT:    vmovdqa 1728(%rdi), %ymm3
-; AVX512DQ-FAST-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa 1664(%rdi), %ymm0
-; AVX512DQ-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
-; AVX512DQ-FAST-NEXT:    vmovdqa64 1600(%rdi), %ymm18
-; AVX512DQ-FAST-NEXT:    vmovdqa64 1536(%rdi), %ymm26
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm26[0],ymm18[0],ymm26[2],ymm18[2]
-; AVX512DQ-FAST-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 1472(%rdi), %zmm1
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 1408(%rdi), %zmm0
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
-; AVX512DQ-FAST-NEXT:    vmovdqa64 1344(%rdi), %zmm3
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 1280(%rdi), %zmm1
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm3, %zmm2, %zmm1
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512DQ-FAST-NEXT:    vmovdqa64 1216(%rdi), %ymm28
-; AVX512DQ-FAST-NEXT:    vmovdqa64 1152(%rdi), %ymm29
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm29[0],ymm28[0],ymm29[2],ymm28[2]
-; AVX512DQ-FAST-NEXT:    vmovdqa64 1088(%rdi), %ymm30
-; AVX512DQ-FAST-NEXT:    vmovdqa64 1024(%rdi), %ymm27
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm27[0],ymm30[0],ymm27[2],ymm30[2]
-; AVX512DQ-FAST-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 3008(%rdi), %zmm1
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 2944(%rdi), %zmm0
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
-; AVX512DQ-FAST-NEXT:    vmovdqa64 2880(%rdi), %zmm3
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 2816(%rdi), %zmm1
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm3, %zmm2, %zmm1
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512DQ-FAST-NEXT:    vmovdqa 2752(%rdi), %ymm0
-; AVX512DQ-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa 2688(%rdi), %ymm11
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm11[0],ymm0[0],ymm11[2],ymm0[2]
-; AVX512DQ-FAST-NEXT:    vmovdqa64 2624(%rdi), %ymm16
-; AVX512DQ-FAST-NEXT:    vmovdqa 2560(%rdi), %ymm9
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm9[0],ymm16[0],ymm9[2],ymm16[2]
-; AVX512DQ-FAST-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 2496(%rdi), %zmm10
-; AVX512DQ-FAST-NEXT:    vmovdqa64 2432(%rdi), %zmm7
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm7, %zmm0
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm10, %zmm2, %zmm0
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 2368(%rdi), %zmm3
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 2304(%rdi), %zmm1
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm3, %zmm2, %zmm1
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512DQ-FAST-NEXT:    vmovdqa 2240(%rdi), %ymm0
-; AVX512DQ-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa 2176(%rdi), %ymm3
-; AVX512DQ-FAST-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2]
-; AVX512DQ-FAST-NEXT:    vmovdqa 2112(%rdi), %ymm3
-; AVX512DQ-FAST-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa 2048(%rdi), %ymm8
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm8[0],ymm3[0],ymm8[2],ymm3[2]
-; AVX512DQ-FAST-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 4032(%rdi), %zmm1
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 3968(%rdi), %zmm6
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm6, %zmm0
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
-; AVX512DQ-FAST-NEXT:    vmovdqa64 3904(%rdi), %zmm1
-; AVX512DQ-FAST-NEXT:    vmovdqa64 3840(%rdi), %zmm3
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vpermi2q %zmm1, %zmm3, %zmm2
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm1, %zmm3
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k1}
-; AVX512DQ-FAST-NEXT:    vmovdqa64 3776(%rdi), %ymm22
-; AVX512DQ-FAST-NEXT:    vmovdqa64 3712(%rdi), %ymm17
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm15 = ymm17[0],ymm22[0],ymm17[2],ymm22[2]
-; AVX512DQ-FAST-NEXT:    vmovdqa 3648(%rdi), %ymm12
-; AVX512DQ-FAST-NEXT:    vmovdqa 3584(%rdi), %ymm0
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm14 = ymm0[0],ymm12[0],ymm0[2],ymm12[2]
-; AVX512DQ-FAST-NEXT:    vperm2i128 {{.*#+}} ymm14 = ymm14[2,3],ymm15[2,3]
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm14, %zmm2, %zmm2
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,3,11,3,11,3,11]
-; AVX512DQ-FAST-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm1, %zmm14
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm4, %zmm2, %zmm14
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm13, %zmm15
-; AVX512DQ-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm15 # 64-byte Folded Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm14, %zmm15 {%k1}
-; AVX512DQ-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
-; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm14 = ymm13[1],ymm21[1],ymm13[3],ymm21[3]
-; AVX512DQ-FAST-NEXT:    vmovdqu (%rsp), %ymm13 # 32-byte Reload
-; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
-; AVX512DQ-FAST-NEXT:    # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3]
-; AVX512DQ-FAST-NEXT:    vperm2i128 {{.*#+}} ymm13 = ymm13[2,3],ymm14[2,3]
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm13, %zmm15, %zmm13
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm24, %zmm2, %zmm13
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm21, %zmm14
-; AVX512DQ-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm14 # 64-byte Folded Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm13, %zmm14 {%k1}
-; AVX512DQ-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
-; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
-; AVX512DQ-FAST-NEXT:    # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3]
-; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm15 = ymm23[1],ymm25[1],ymm23[3],ymm25[3]
-; AVX512DQ-FAST-NEXT:    vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3]
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm13, %zmm14, %zmm13
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm25, %zmm13
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm24, %zmm2, %zmm13
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm23, %zmm2, %zmm5
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm13, %zmm5 {%k1}
-; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm31, %ymm13 # 32-byte Folded Reload
-; AVX512DQ-FAST-NEXT:    # ymm13 = ymm31[1],mem[1],ymm31[3],mem[3]
-; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm15 = ymm19[1],ymm20[1],ymm19[3],ymm20[3]
-; AVX512DQ-FAST-NEXT:    vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3]
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm13, %zmm5, %zmm13
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm31, %zmm13
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm20, %zmm2, %zmm13
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm19, %zmm2, %zmm14
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm13, %zmm14 {%k1}
-; AVX512DQ-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm13 # 32-byte Folded Reload
-; AVX512DQ-FAST-NEXT:    # ymm13 = ymm5[1],mem[1],ymm5[3],mem[3]
-; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm15 = ymm26[1],ymm18[1],ymm26[3],ymm18[3]
-; AVX512DQ-FAST-NEXT:    vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3]
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm13, %zmm14, %zmm13
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm18, %zmm13
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm26, %zmm2, %zmm13
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm14 # 64-byte Folded Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm13, %zmm14 {%k1}
-; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm13 = ymm29[1],ymm28[1],ymm29[3],ymm28[3]
-; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm15 = ymm27[1],ymm30[1],ymm27[3],ymm30[3]
-; AVX512DQ-FAST-NEXT:    vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3]
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm13, %zmm14, %zmm13
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm27, %zmm13
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm28, %zmm2, %zmm13
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm30, %zmm14
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm29, %zmm2, %zmm14
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm13, %zmm14 {%k1}
-; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
-; AVX512DQ-FAST-NEXT:    # ymm11 = ymm11[1],mem[1],ymm11[3],mem[3]
-; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm9 = ymm9[1],ymm16[1],ymm9[3],ymm16[3]
-; AVX512DQ-FAST-NEXT:    vperm2i128 {{.*#+}} ymm9 = ymm9[2,3],ymm11[2,3]
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm9, %zmm14, %zmm9
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm10, %zmm2, %zmm7
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm16, %zmm10
-; AVX512DQ-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm10 # 64-byte Folded Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm7, %zmm10 {%k1}
-; AVX512DQ-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm7 # 32-byte Folded Reload
-; AVX512DQ-FAST-NEXT:    # ymm7 = ymm5[1],mem[1],ymm5[3],mem[3]
-; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm5 # 32-byte Folded Reload
-; AVX512DQ-FAST-NEXT:    # ymm5 = ymm8[1],mem[1],ymm8[3],mem[3]
-; AVX512DQ-FAST-NEXT:    vperm2i128 {{.*#+}} ymm5 = ymm5[2,3],ymm7[2,3]
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm5, %zmm10, %zmm5
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm14, %zmm2, %zmm6
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vpermi2q %zmm3, %zmm8, %zmm2
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm6, %zmm2 {%k1}
-; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm17[1],ymm22[1],ymm17[3],ymm22[3]
-; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm12[1],ymm0[3],ymm12[3]
-; AVX512DQ-FAST-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3]
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm0
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12]
-; AVX512DQ-FAST-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm1, %zmm13
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm4, %zmm11
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm4, %zmm0, %zmm1
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm2 = zmm22[0],zmm9[0],zmm22[2],zmm9[2],zmm22[4],zmm9[4],zmm22[6],zmm9[6]
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
-; AVX512DQ-FAST-NEXT:    vmovdqa64 3136(%rdi), %zmm4
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 3072(%rdi), %zmm15
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm15, %zmm3
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm4, %zmm0, %zmm3
-; AVX512DQ-FAST-NEXT:    vmovdqa64 3264(%rdi), %zmm1
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 3200(%rdi), %zmm4
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm6 = [4,12,4,12]
-; AVX512DQ-FAST-NEXT:    # ymm6 = mem[0,1,0,1]
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm1, %zmm6, %zmm4
-; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7]
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm2, %zmm2
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm21[0],zmm17[0],zmm21[2],zmm17[2],zmm21[4],zmm17[4],zmm21[6],zmm17[6]
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
-; AVX512DQ-FAST-NEXT:    vmovdqa64 576(%rdi), %zmm1
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 512(%rdi), %zmm2
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm1, %zmm0, %zmm2
-; AVX512DQ-FAST-NEXT:    vmovdqa64 704(%rdi), %zmm21
-; AVX512DQ-FAST-NEXT:    vmovdqa64 640(%rdi), %zmm4
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm21, %zmm6, %zmm4
-; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm25, %zmm2
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm24, %zmm0, %zmm2
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm12[0],zmm23[0],zmm12[2],zmm23[2],zmm12[4],zmm23[4],zmm12[6],zmm23[6]
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
-; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%rdi), %zmm1
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 (%rdi), %zmm2
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm1, %zmm0, %zmm2
-; AVX512DQ-FAST-NEXT:    vmovdqa64 192(%rdi), %zmm1
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 128(%rdi), %zmm4
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm1, %zmm6, %zmm4
-; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm31, %zmm2
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm20, %zmm0, %zmm2
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm1[0],zmm19[0],zmm1[2],zmm19[2],zmm1[4],zmm19[4],zmm1[6],zmm19[6]
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
-; AVX512DQ-FAST-NEXT:    vmovdqa64 1600(%rdi), %zmm1
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 1536(%rdi), %zmm2
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm1, %zmm0, %zmm2
-; AVX512DQ-FAST-NEXT:    vmovdqa64 1728(%rdi), %zmm1
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 1664(%rdi), %zmm4
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm1, %zmm6, %zmm4
-; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm18, %zmm2
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm26, %zmm0, %zmm2
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm1[0],zmm10[0],zmm1[2],zmm10[2],zmm1[4],zmm10[4],zmm1[6],zmm10[6]
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
-; AVX512DQ-FAST-NEXT:    vmovdqa64 1088(%rdi), %zmm1
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 1024(%rdi), %zmm2
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm1, %zmm0, %zmm2
-; AVX512DQ-FAST-NEXT:    vmovdqa64 1216(%rdi), %zmm1
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 1152(%rdi), %zmm4
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm1, %zmm6, %zmm4
-; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm27, %zmm2
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm28, %zmm0, %zmm2
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm30[0],zmm29[0],zmm30[2],zmm29[2],zmm30[4],zmm29[4],zmm30[6],zmm29[6]
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
-; AVX512DQ-FAST-NEXT:    vmovdqa64 2624(%rdi), %zmm1
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 2560(%rdi), %zmm2
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm1, %zmm0, %zmm2
-; AVX512DQ-FAST-NEXT:    vmovdqa64 2752(%rdi), %zmm1
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 2688(%rdi), %zmm4
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm1, %zmm6, %zmm4
-; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 2112(%rdi), %zmm7
-; AVX512DQ-FAST-NEXT:    vmovdqa64 2048(%rdi), %zmm2
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm7, %zmm0, %zmm2
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 2240(%rdi), %zmm1
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 2176(%rdi), %zmm3
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm1, %zmm6, %zmm3
-; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm26, %zmm3
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm5, %zmm0, %zmm3
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm4 = zmm16[0],zmm30[0],zmm16[2],zmm30[2],zmm16[4],zmm30[4],zmm16[6],zmm30[6]
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm31, %zmm2
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm14, %zmm0, %zmm2
-; AVX512DQ-FAST-NEXT:    vmovdqa64 3648(%rdi), %zmm3
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 3584(%rdi), %zmm1
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, (%rsp) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vpermi2q %zmm3, %zmm1, %zmm0
-; AVX512DQ-FAST-NEXT:    vmovdqa64 3776(%rdi), %zmm3
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 3712(%rdi), %zmm1
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vpermi2q %zmm3, %zmm1, %zmm6
-; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7]
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm1 # 64-byte Folded Reload
-; AVX512DQ-FAST-NEXT:    # zmm1 = zmm8[0],mem[0],zmm8[2],mem[2],zmm8[4],mem[4],zmm8[6],mem[6]
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, %zmm1 {%k1}
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [5,13,5,13,5,13,5,13]
-; AVX512DQ-FAST-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm25, %zmm0, %zmm15
-; AVX512DQ-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [5,13,5,13]
-; AVX512DQ-FAST-NEXT:    # ymm1 = mem[0,1,0,1]
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm24, %zmm1, %zmm3
-; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3],ymm3[4,5,6,7]
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm11, %zmm0, %zmm13
-; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm4 = zmm22[1],zmm9[1],zmm22[3],zmm9[3],zmm22[5],zmm9[5],zmm22[7],zmm9[7]
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm13, %zmm4 {%k1}
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm20, %zmm0, %zmm2
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm21, %zmm1, %zmm3
-; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm23, %zmm3
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm18, %zmm0, %zmm3
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm17, %zmm19
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm4 = zmm17[1],zmm19[1],zmm17[3],zmm19[3],zmm17[5],zmm19[5],zmm17[7],zmm19[7]
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm14, %zmm2
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm16, %zmm0, %zmm2
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm28, %zmm1, %zmm3
-; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
-; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm4 # 64-byte Folded Reload
-; AVX512DQ-FAST-NEXT:    # zmm4 = zmm12[1],mem[1],zmm12[3],mem[3],zmm12[5],mem[5],zmm12[7],mem[7]
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm15, %zmm0, %zmm2
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm13, %zmm1, %zmm3
-; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload
-; AVX512DQ-FAST-NEXT:    # zmm4 = zmm4[1],mem[1],zmm4[3],mem[3],zmm4[5],mem[5],zmm4[7],mem[7]
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm11, %zmm0, %zmm2
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm29, %zmm1, %zmm3
-; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm10, %zmm27
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm4 = zmm4[1],zmm10[1],zmm4[3],zmm10[3],zmm4[5],zmm10[5],zmm4[7],zmm10[7]
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm9, %zmm0, %zmm2
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm10, %zmm1, %zmm3
-; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload
-; AVX512DQ-FAST-NEXT:    # zmm4 = zmm4[1],mem[1],zmm4[3],mem[3],zmm4[5],mem[5],zmm4[7],mem[7]
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm7, %zmm0, %zmm2
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm8, %zmm3
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm7, %zmm1, %zmm3
-; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm26, %zmm3
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm5, %zmm0, %zmm3
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm4 = zmm5[1],zmm30[1],zmm5[3],zmm30[3],zmm5[5],zmm30[5],zmm5[7],zmm30[7]
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm31, %zmm2
-; AVX512DQ-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
-; AVX512DQ-FAST-NEXT:    vmovdqu64 (%rsp), %zmm6 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vpermi2q %zmm31, %zmm6, %zmm0
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vpermi2q %zmm30, %zmm3, %zmm1
-; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm1 # 64-byte Folded Reload
-; AVX512DQ-FAST-NEXT:    # zmm1 = zmm4[1],mem[1],zmm4[3],mem[3],zmm4[5],mem[5],zmm4[7],mem[7]
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, %zmm1 {%k1}
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [6,14,6,14,6,14,6,14]
-; AVX512DQ-FAST-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm25, %zmm0, %zmm2
-; AVX512DQ-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14]
-; AVX512DQ-FAST-NEXT:    # ymm1 = mem[0,1,0,1]
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm24, %zmm1, %zmm3
-; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm22, %zmm3
-; AVX512DQ-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm22, %zmm3 {%k1} # 64-byte Folded Reload
-; AVX512DQ-FAST-NEXT:    # zmm3 {%k1} = zmm22[0],mem[0],zmm22[2],mem[2],zmm22[4],mem[4],zmm22[6],mem[6]
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm20, %zmm0, %zmm2
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm21, %zmm1, %zmm3
-; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm17, %zmm3
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm19, %zmm0, %zmm3
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm23[0],zmm18[0],zmm23[2],zmm18[2],zmm23[4],zmm18[4],zmm23[6],zmm18[6]
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm16, %zmm0, %zmm14
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm28, %zmm1, %zmm3
-; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3],ymm3[4,5,6,7]
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm12, %zmm3
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm12, %zmm18
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm28, %zmm0, %zmm3
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm20[0],zmm14[0],zmm20[2],zmm14[2],zmm20[4],zmm14[4],zmm20[6],zmm14[6]
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm15, %zmm0, %zmm2
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm13, %zmm1, %zmm3
-; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm12, %zmm3
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm15, %zmm0, %zmm3
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm26[0],zmm25[0],zmm26[2],zmm25[2],zmm26[4],zmm25[4],zmm26[6],zmm25[6]
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm11, %zmm0, %zmm2
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm29, %zmm1, %zmm3
-; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm27, %zmm0, %zmm3
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm24[0],zmm29[0],zmm24[2],zmm29[2],zmm24[4],zmm29[4],zmm24[6],zmm29[6]
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm9, %zmm0, %zmm2
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm10, %zmm1, %zmm3
-; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm9, %zmm3
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm10, %zmm0, %zmm3
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm22[0],zmm19[0],zmm22[2],zmm19[2],zmm22[4],zmm19[4],zmm22[6],zmm19[6]
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm7, %zmm1, %zmm8
-; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7]
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm8, %zmm0, %zmm5
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm21[0],zmm17[0],zmm21[2],zmm17[2],zmm21[4],zmm17[4],zmm21[6],zmm17[6]
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm5, %zmm2
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload
-; AVX512DQ-FAST-NEXT:    vpermi2q %zmm31, %zmm6, %zmm0
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vpermi2q %zmm30, %zmm3, %zmm1
-; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm1[0],zmm23[0],zmm1[2],zmm23[2],zmm1[4],zmm23[4],zmm1[6],zmm23[6]
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm0
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm30
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm5
-; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm16 = [7,15,7,15,7,15,7,15]
-; AVX512DQ-FAST-NEXT:    # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm2, %zmm16, %zmm0
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm4[1],zmm1[1],zmm4[3],zmm1[3],zmm4[5],zmm1[5],zmm4[7],zmm1[7]
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm4, %zmm0
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm4, %zmm6
-; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,8,0,8,0,8,0,8]
-; AVX512DQ-FAST-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm1, %zmm3, %zmm0
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm13 = [1,9,1,9,1,9,1,9]
-; AVX512DQ-FAST-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm1, %zmm13, %zmm6
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm2, %zmm3, %zmm30
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm2, %zmm13, %zmm5
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm18, %zmm1
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm18, %zmm0
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm28, %zmm16, %zmm1
-; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm20[1],zmm14[1],zmm20[3],zmm14[3],zmm20[5],zmm14[5],zmm20[7],zmm14[7]
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm20, %zmm1
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm14, %zmm3, %zmm20
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm14, %zmm13, %zmm1
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm28, %zmm3, %zmm18
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm28, %zmm13, %zmm0
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm12, %zmm28
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm12, %zmm0
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm15, %zmm16, %zmm12
-; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm12 {%k1} = zmm26[1],zmm25[1],zmm26[3],zmm25[3],zmm26[5],zmm25[5],zmm26[7],zmm25[7]
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm26, %zmm1
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm25, %zmm3, %zmm26
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm25, %zmm13, %zmm1
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm15, %zmm3, %zmm28
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm15, %zmm13, %zmm0
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm1, %zmm25
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm1, %zmm0
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm27, %zmm16, %zmm1
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm24, %zmm7
-; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm24[1],zmm29[1],zmm24[3],zmm29[3],zmm24[5],zmm29[5],zmm24[7],zmm29[7]
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm24, %zmm1
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm29, %zmm3, %zmm7
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm29, %zmm13, %zmm1
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm27, %zmm3, %zmm25
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm27, %zmm13, %zmm0
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm9, %zmm27
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm9, %zmm0
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm10, %zmm16, %zmm9
-; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm9 {%k1} = zmm22[1],zmm19[1],zmm22[3],zmm19[3],zmm22[5],zmm19[5],zmm22[7],zmm19[7]
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm22, %zmm1
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm19, %zmm3, %zmm22
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm19, %zmm13, %zmm1
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm10, %zmm3, %zmm27
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm10, %zmm13, %zmm0
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm31, %zmm29
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm31, %zmm0
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm8, %zmm16, %zmm31
-; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm31 {%k1} = zmm21[1],zmm17[1],zmm21[3],zmm17[3],zmm21[5],zmm17[5],zmm21[7],zmm17[7]
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm21, %zmm1
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm21, %zmm19
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm17, %zmm3, %zmm19
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm17, %zmm13, %zmm1
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm8, %zmm3, %zmm29
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm8, %zmm13, %zmm0
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm10, %zmm0
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm8, %zmm3, %zmm0
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm21, %zmm22
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm9, %zmm3, %zmm22
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm24, %zmm0
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm23, %zmm3, %zmm0
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vpermi2q %zmm26, %zmm15, %zmm3
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm10, %zmm0
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm8, %zmm13, %zmm0
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm21, %zmm17
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm21, %zmm0
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm9, %zmm13, %zmm17
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm24, %zmm1
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm23, %zmm13, %zmm1
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vpermi2q %zmm26, %zmm15, %zmm13
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm26, %zmm16, %zmm15
-; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm24[1],zmm23[1],zmm24[3],zmm23[3],zmm24[5],zmm23[5],zmm24[7],zmm23[7]
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm15, %zmm21
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm15
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm9, %zmm16, %zmm15
-; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm10[1],zmm8[1],zmm10[3],zmm8[3],zmm10[5],zmm8[5],zmm10[7],zmm8[7]
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm15, %zmm23
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm12 # 64-byte Folded Reload
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm5 # 64-byte Folded Reload
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm6 # 64-byte Folded Reload
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm7 # 64-byte Folded Reload
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm8 # 64-byte Folded Reload
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm10 # 64-byte Folded Reload
-; AVX512DQ-FAST-NEXT:    vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm14 # 64-byte Folded Reload
-; AVX512DQ-FAST-NEXT:    vbroadcasti64x2 {{.*#+}} ymm16 = [7,15,7,15]
-; AVX512DQ-FAST-NEXT:    # ymm16 = mem[0,1,0,1]
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm15 # 64-byte Folded Reload
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm9 # 64-byte Folded Reload
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm1 # 64-byte Folded Reload
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm2 # 64-byte Folded Reload
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm4 # 64-byte Folded Reload
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm11 # 64-byte Folded Reload
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm11 # 64-byte Folded Reload
-; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm15 = ymm12[0,1,2,3],ymm15[4,5,6,7]
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm15, %zmm12, %zmm12
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm15 = ymm5[0,1,2,3],ymm9[4,5,6,7]
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm15, %zmm5, %zmm5
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm15 = ymm6[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm15, %zmm0, %zmm0
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm15 = ymm7[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm15, %zmm0, %zmm0
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm15 = ymm8[0,1,2,3],ymm2[4,5,6,7]
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm15, %zmm0, %zmm0
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm15 = ymm10[0,1,2,3],ymm4[4,5,6,7]
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm15, %zmm31, %zmm0
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload
-; AVX512DQ-FAST-NEXT:    # ymm15 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm15, %zmm21, %zmm0
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm15 = ymm14[0,1,2,3],ymm11[4,5,6,7]
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm15, %zmm23, %zmm0
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm20, %zmm18 {%k1}
-; AVX512DQ-FAST-NEXT:    vmovdqa 192(%rdi), %xmm0
-; AVX512DQ-FAST-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa 128(%rdi), %xmm1
-; AVX512DQ-FAST-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX512DQ-FAST-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512DQ-FAST-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa 64(%rdi), %xmm1
-; AVX512DQ-FAST-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm16 = xmm0[0],xmm1[0]
-; AVX512DQ-FAST-NEXT:    vinserti32x4 $1, %xmm2, %ymm16, %ymm2
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm18, %zmm0
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm30 {%k1}
-; AVX512DQ-FAST-NEXT:    vmovdqa 704(%rdi), %xmm8
-; AVX512DQ-FAST-NEXT:    vmovdqa64 640(%rdi), %xmm20
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm20[0],xmm8[0]
-; AVX512DQ-FAST-NEXT:    vmovdqa64 576(%rdi), %xmm18
-; AVX512DQ-FAST-NEXT:    vmovdqa64 512(%rdi), %xmm16
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm31 = xmm16[0],xmm18[0]
-; AVX512DQ-FAST-NEXT:    vinserti32x4 $1, %xmm2, %ymm31, %ymm2
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm30, %zmm0
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm25 {%k1}
-; AVX512DQ-FAST-NEXT:    vmovdqa 1216(%rdi), %xmm0
-; AVX512DQ-FAST-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa 1152(%rdi), %xmm1
-; AVX512DQ-FAST-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm31 = xmm1[0],xmm0[0]
-; AVX512DQ-FAST-NEXT:    vmovdqa64 1088(%rdi), %xmm23
-; AVX512DQ-FAST-NEXT:    vmovdqa 1024(%rdi), %xmm14
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm30 = xmm14[0],xmm23[0]
-; AVX512DQ-FAST-NEXT:    vinserti32x4 $1, %xmm31, %ymm30, %ymm30
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm30, %zmm25, %zmm0
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm28 {%k1}
-; AVX512DQ-FAST-NEXT:    vmovdqa64 1728(%rdi), %xmm30
-; AVX512DQ-FAST-NEXT:    vmovdqa64 1664(%rdi), %xmm31
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm31[0],xmm30[0]
-; AVX512DQ-FAST-NEXT:    vmovdqa 1600(%rdi), %xmm12
-; AVX512DQ-FAST-NEXT:    vmovdqa 1536(%rdi), %xmm11
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm25 = xmm11[0],xmm12[0]
-; AVX512DQ-FAST-NEXT:    vinserti32x4 $1, %xmm2, %ymm25, %ymm2
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm28, %zmm0
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm19, %zmm29 {%k1}
-; AVX512DQ-FAST-NEXT:    vmovdqa64 2240(%rdi), %xmm19
-; AVX512DQ-FAST-NEXT:    vmovdqa64 2176(%rdi), %xmm21
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm25 = xmm21[0],xmm19[0]
-; AVX512DQ-FAST-NEXT:    vmovdqa 2112(%rdi), %xmm15
-; AVX512DQ-FAST-NEXT:    vmovdqa 2048(%rdi), %xmm1
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm28 = xmm1[0],xmm15[0]
-; AVX512DQ-FAST-NEXT:    vinserti32x4 $1, %xmm25, %ymm28, %ymm25
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm25, %zmm29, %zmm0
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm27 {%k1}
-; AVX512DQ-FAST-NEXT:    vmovdqa64 2752(%rdi), %xmm28
-; AVX512DQ-FAST-NEXT:    vmovdqa64 2688(%rdi), %xmm29
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm29[0],xmm28[0]
-; AVX512DQ-FAST-NEXT:    vmovdqa 2624(%rdi), %xmm10
-; AVX512DQ-FAST-NEXT:    vmovdqa 2560(%rdi), %xmm9
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm25 = xmm9[0],xmm10[0]
-; AVX512DQ-FAST-NEXT:    vinserti32x4 $1, %xmm2, %ymm25, %ymm2
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm27, %zmm0
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm22 {%k1}
-; AVX512DQ-FAST-NEXT:    vmovdqa64 3264(%rdi), %xmm25
-; AVX512DQ-FAST-NEXT:    vmovdqa64 3200(%rdi), %xmm27
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm27[0],xmm25[0]
-; AVX512DQ-FAST-NEXT:    vmovdqa 3136(%rdi), %xmm0
-; AVX512DQ-FAST-NEXT:    vmovdqa 3072(%rdi), %xmm2
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm26 = xmm2[0],xmm0[0]
-; AVX512DQ-FAST-NEXT:    vinserti32x4 $1, %xmm4, %ymm26, %ymm4
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm22, %zmm22
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm4, %zmm3 {%k1}
-; AVX512DQ-FAST-NEXT:    vmovdqa 3776(%rdi), %xmm7
-; AVX512DQ-FAST-NEXT:    vmovdqa64 3712(%rdi), %xmm26
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm26[0],xmm7[0]
-; AVX512DQ-FAST-NEXT:    vmovdqa 3648(%rdi), %xmm6
-; AVX512DQ-FAST-NEXT:    vmovdqa 3584(%rdi), %xmm5
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm24 = xmm5[0],xmm6[0]
-; AVX512DQ-FAST-NEXT:    vinserti32x4 $1, %xmm4, %ymm24, %ymm4
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm3, %zmm24
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, %zmm17 {%k1}
-; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm4 = xmm27[1],xmm25[1]
-; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm0[1]
-; AVX512DQ-FAST-NEXT:    vinserti128 $1, %xmm4, %ymm2, %ymm2
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm17, %zmm2
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, %zmm0 {%k1}
-; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm3 = xmm20[1],xmm8[1]
-; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm4 = xmm16[1],xmm18[1]
-; AVX512DQ-FAST-NEXT:    vinserti128 $1, %xmm3, %ymm4, %ymm3
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm0, %zmm3
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm4, %zmm0 {%k1}
-; AVX512DQ-FAST-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
-; AVX512DQ-FAST-NEXT:    # xmm4 = xmm4[1],mem[1]
-; AVX512DQ-FAST-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
-; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload
-; AVX512DQ-FAST-NEXT:    # xmm8 = xmm8[1],mem[1]
-; AVX512DQ-FAST-NEXT:    vinserti128 $1, %xmm4, %ymm8, %ymm4
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm0, %zmm4
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm8, %zmm0 {%k1}
-; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm8 = xmm31[1],xmm30[1]
-; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm11 = xmm11[1],xmm12[1]
-; AVX512DQ-FAST-NEXT:    vinserti128 $1, %xmm8, %ymm11, %ymm8
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm8, %zmm0, %zmm8
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm11, %zmm0 {%k1}
-; AVX512DQ-FAST-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
-; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload
-; AVX512DQ-FAST-NEXT:    # xmm11 = xmm11[1],mem[1]
-; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm12 = xmm14[1],xmm23[1]
-; AVX512DQ-FAST-NEXT:    vinserti128 $1, %xmm11, %ymm12, %ymm11
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm11, %zmm0, %zmm11
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm12, %zmm0 {%k1}
-; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm12 = xmm29[1],xmm28[1]
-; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm9 = xmm9[1],xmm10[1]
-; AVX512DQ-FAST-NEXT:    vinserti128 $1, %xmm12, %ymm9, %ymm9
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm9, %zmm0, %zmm9
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm10, %zmm0 {%k1}
-; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm10 = xmm21[1],xmm19[1]
-; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm12 = xmm1[1],xmm15[1]
-; AVX512DQ-FAST-NEXT:    vinserti128 $1, %xmm10, %ymm12, %ymm10
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm10, %zmm0, %zmm10
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm13 {%k1}
-; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm7 = xmm26[1],xmm7[1]
-; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm6[1]
-; AVX512DQ-FAST-NEXT:    vinserti128 $1, %xmm7, %ymm5, %ymm5
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm5, %zmm13, %zmm1
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm24, 448(%rsi)
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm22, 384(%rsi)
-; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 320(%rsi)
-; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 256(%rsi)
-; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 192(%rsi)
-; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 128(%rsi)
-; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 64(%rsi)
-; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, (%rsi)
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm1, 448(%rdx)
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm10, 256(%rdx)
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm9, 320(%rdx)
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm11, 128(%rdx)
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm8, 192(%rdx)
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm4, (%rdx)
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, 64(%rdx)
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, 384(%rdx)
-; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 448(%rcx)
-; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 256(%rcx)
-; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 320(%rcx)
-; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 128(%rcx)
-; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 192(%rcx)
-; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, (%rcx)
-; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 64(%rcx)
-; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 384(%rcx)
-; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 448(%r8)
-; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 256(%r8)
-; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 320(%r8)
-; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 128(%r8)
-; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 192(%r8)
-; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, (%r8)
-; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 64(%r8)
-; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 384(%r8)
-; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 448(%r9)
-; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 256(%r9)
-; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 320(%r9)
-; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 128(%r9)
-; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 192(%r9)
-; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, (%r9)
-; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 64(%r9)
-; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 384(%r9)
-; AVX512DQ-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 448(%rax)
-; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 256(%rax)
-; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 320(%rax)
-; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 128(%rax)
-; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 192(%rax)
-; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, (%rax)
-; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 64(%rax)
-; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 384(%rax)
-; AVX512DQ-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 448(%rax)
-; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 256(%rax)
-; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 320(%rax)
-; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 128(%rax)
-; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 192(%rax)
-; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, (%rax)
-; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 64(%rax)
-; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 384(%rax)
-; AVX512DQ-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-FAST-NEXT:    vmovups (%rsp), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 384(%rax)
-; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 448(%rax)
-; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 256(%rax)
-; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 320(%rax)
-; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 128(%rax)
-; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 192(%rax)
-; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, (%rax)
-; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 64(%rax)
-; AVX512DQ-FAST-NEXT:    addq $6728, %rsp # imm = 0x1A48
-; AVX512DQ-FAST-NEXT:    vzeroupper
-; AVX512DQ-FAST-NEXT:    retq
-;
-; AVX512BW-ONLY-SLOW-LABEL: load_i64_stride8_vf64:
-; AVX512BW-ONLY-SLOW:       # %bb.0:
-; AVX512BW-ONLY-SLOW-NEXT:    subq $6728, %rsp # imm = 0x1A48
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 3392(%rdi), %zmm3
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 3328(%rdi), %zmm13
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 3520(%rdi), %zmm1
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 3456(%rdi), %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 1856(%rdi), %zmm11
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 1984(%rdi), %zmm15
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 832(%rdi), %zmm6
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 768(%rdi), %zmm7
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 960(%rdi), %zmm24
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 896(%rdi), %zmm10
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 320(%rdi), %zmm12
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 256(%rdi), %zmm5
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 448(%rdi), %zmm8
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 384(%rdi), %zmm14
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    movb $-64, %al
-; AVX512BW-ONLY-SLOW-NEXT:    kmovd %eax, %k1
-; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10]
-; AVX512BW-ONLY-SLOW-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm1
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm2, %zmm1
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 3264(%rdi), %ymm21
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 3200(%rdi), %ymm0
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm21[0],ymm0[2],ymm21[2]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 3136(%rdi), %ymm4
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 3072(%rdi), %ymm3
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu %ymm3, (%rsp) # 32-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2]
-; AVX512BW-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm24, %zmm2, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm1
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm6, %zmm2, %zmm1
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 704(%rdi), %ymm3
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 640(%rdi), %ymm0
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 576(%rdi), %ymm25
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 512(%rdi), %ymm23
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm23[0],ymm25[0],ymm23[2],ymm25[2]
-; AVX512BW-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm14, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm8, %zmm2, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm1
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm12, %zmm2, %zmm1
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 192(%rdi), %ymm0
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 128(%rdi), %ymm31
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm31[0],ymm0[0],ymm31[2],ymm0[2]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 64(%rdi), %ymm20
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%rdi), %ymm19
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm19[0],ymm20[0],ymm19[2],ymm20[2]
-; AVX512BW-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 1920(%rdi), %zmm3
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm15, %zmm2, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 1792(%rdi), %zmm1
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm11, %zmm2, %zmm1
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 1728(%rdi), %ymm3
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 1664(%rdi), %ymm0
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 1600(%rdi), %ymm18
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 1536(%rdi), %ymm26
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm26[0],ymm18[0],ymm26[2],ymm18[2]
-; AVX512BW-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 1472(%rdi), %zmm1
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 1408(%rdi), %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 1344(%rdi), %zmm3
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 1280(%rdi), %zmm1
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm2, %zmm1
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 1216(%rdi), %ymm28
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 1152(%rdi), %ymm29
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm29[0],ymm28[0],ymm29[2],ymm28[2]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 1088(%rdi), %ymm30
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 1024(%rdi), %ymm27
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm27[0],ymm30[0],ymm27[2],ymm30[2]
-; AVX512BW-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 3008(%rdi), %zmm1
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 2944(%rdi), %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 2880(%rdi), %zmm3
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 2816(%rdi), %zmm1
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm2, %zmm1
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 2752(%rdi), %ymm0
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 2688(%rdi), %ymm11
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm11[0],ymm0[0],ymm11[2],ymm0[2]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 2624(%rdi), %ymm16
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 2560(%rdi), %ymm9
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm9[0],ymm16[0],ymm9[2],ymm16[2]
-; AVX512BW-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 2496(%rdi), %zmm10
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 2432(%rdi), %zmm7
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm10, %zmm2, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 2368(%rdi), %zmm3
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 2304(%rdi), %zmm1
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm2, %zmm1
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 2240(%rdi), %ymm0
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 2176(%rdi), %ymm3
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 2112(%rdi), %ymm3
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 2048(%rdi), %ymm8
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm8[0],ymm3[0],ymm8[2],ymm3[2]
-; AVX512BW-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 4032(%rdi), %zmm1
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 3968(%rdi), %zmm6
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 3904(%rdi), %zmm1
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 3840(%rdi), %zmm3
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm1, %zmm3, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm3
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 3776(%rdi), %ymm22
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 3712(%rdi), %ymm17
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm15 = ymm17[0],ymm22[0],ymm17[2],ymm22[2]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 3648(%rdi), %ymm12
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 3584(%rdi), %ymm0
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm14 = ymm0[0],ymm12[0],ymm0[2],ymm12[2]
-; AVX512BW-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm14 = ymm14[2,3],ymm15[2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm14, %zmm2, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,3,11,3,11,3,11]
-; AVX512BW-ONLY-SLOW-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm14
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm2, %zmm14
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm15
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm15 # 64-byte Folded Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm14, %zmm15 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm14 = ymm13[1],ymm21[1],ymm13[3],ymm21[3]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu (%rsp), %ymm13 # 32-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
-; AVX512BW-ONLY-SLOW-NEXT:    # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3]
-; AVX512BW-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm13 = ymm13[2,3],ymm14[2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm13, %zmm15, %zmm13
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm24, %zmm2, %zmm13
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm14
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm14 # 64-byte Folded Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm14 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
-; AVX512BW-ONLY-SLOW-NEXT:    # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3]
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm15 = ymm23[1],ymm25[1],ymm23[3],ymm25[3]
-; AVX512BW-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm13, %zmm14, %zmm13
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm25, %zmm13
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm24, %zmm2, %zmm13
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm23, %zmm2, %zmm5
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm5 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm31, %ymm13 # 32-byte Folded Reload
-; AVX512BW-ONLY-SLOW-NEXT:    # ymm13 = ymm31[1],mem[1],ymm31[3],mem[3]
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm15 = ymm19[1],ymm20[1],ymm19[3],ymm20[3]
-; AVX512BW-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm13, %zmm5, %zmm13
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm13
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm20, %zmm2, %zmm13
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm19, %zmm2, %zmm14
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm14 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm13 # 32-byte Folded Reload
-; AVX512BW-ONLY-SLOW-NEXT:    # ymm13 = ymm5[1],mem[1],ymm5[3],mem[3]
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm15 = ymm26[1],ymm18[1],ymm26[3],ymm18[3]
-; AVX512BW-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm13, %zmm14, %zmm13
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm13
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm26, %zmm2, %zmm13
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm14 # 64-byte Folded Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm14 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm13 = ymm29[1],ymm28[1],ymm29[3],ymm28[3]
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm15 = ymm27[1],ymm30[1],ymm27[3],ymm30[3]
-; AVX512BW-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm13, %zmm14, %zmm13
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm27, %zmm13
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm28, %zmm2, %zmm13
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm30, %zmm14
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm29, %zmm2, %zmm14
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm14 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
-; AVX512BW-ONLY-SLOW-NEXT:    # ymm11 = ymm11[1],mem[1],ymm11[3],mem[3]
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm9 = ymm9[1],ymm16[1],ymm9[3],ymm16[3]
-; AVX512BW-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm9 = ymm9[2,3],ymm11[2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm9, %zmm14, %zmm9
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm10, %zmm2, %zmm7
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm10
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm10 # 64-byte Folded Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm10 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm7 # 32-byte Folded Reload
-; AVX512BW-ONLY-SLOW-NEXT:    # ymm7 = ymm5[1],mem[1],ymm5[3],mem[3]
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm5 # 32-byte Folded Reload
-; AVX512BW-ONLY-SLOW-NEXT:    # ymm5 = ymm8[1],mem[1],ymm8[3],mem[3]
-; AVX512BW-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm5 = ymm5[2,3],ymm7[2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm5, %zmm10, %zmm5
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm14, %zmm2, %zmm6
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm3, %zmm8, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm2 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm17[1],ymm22[1],ymm17[3],ymm22[3]
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm12[1],ymm0[3],ymm12[3]
-; AVX512BW-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12]
-; AVX512BW-ONLY-SLOW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm13
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm11
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm0, %zmm1
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm2 = zmm22[0],zmm9[0],zmm22[2],zmm9[2],zmm22[4],zmm9[4],zmm22[6],zmm9[6]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 3136(%rdi), %zmm4
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 3072(%rdi), %zmm15
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm15, %zmm3
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm0, %zmm3
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 3264(%rdi), %zmm1
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 3200(%rdi), %zmm4
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm6 = [4,12,4,12]
-; AVX512BW-ONLY-SLOW-NEXT:    # ymm6 = mem[0,1,0,1]
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm6, %zmm4
-; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm2, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm21[0],zmm17[0],zmm21[2],zmm17[2],zmm21[4],zmm17[4],zmm21[6],zmm17[6]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 576(%rdi), %zmm1
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 512(%rdi), %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm0, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 704(%rdi), %zmm21
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 640(%rdi), %zmm4
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm21, %zmm6, %zmm4
-; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm25, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm24, %zmm0, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm12[0],zmm23[0],zmm12[2],zmm23[2],zmm12[4],zmm23[4],zmm12[6],zmm23[6]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 64(%rdi), %zmm1
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm0, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 192(%rdi), %zmm1
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 128(%rdi), %zmm4
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm6, %zmm4
-; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm20, %zmm0, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm1[0],zmm19[0],zmm1[2],zmm19[2],zmm1[4],zmm19[4],zmm1[6],zmm19[6]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 1600(%rdi), %zmm1
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 1536(%rdi), %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm0, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 1728(%rdi), %zmm1
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 1664(%rdi), %zmm4
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm6, %zmm4
-; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm26, %zmm0, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm1[0],zmm10[0],zmm1[2],zmm10[2],zmm1[4],zmm10[4],zmm1[6],zmm10[6]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 1088(%rdi), %zmm1
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 1024(%rdi), %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm0, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 1216(%rdi), %zmm1
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 1152(%rdi), %zmm4
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm6, %zmm4
-; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm27, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm28, %zmm0, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm30[0],zmm29[0],zmm30[2],zmm29[2],zmm30[4],zmm29[4],zmm30[6],zmm29[6]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 2624(%rdi), %zmm1
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 2560(%rdi), %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm0, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 2752(%rdi), %zmm1
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 2688(%rdi), %zmm4
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm6, %zmm4
-; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 2112(%rdi), %zmm7
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 2048(%rdi), %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm0, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 2240(%rdi), %zmm1
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 2176(%rdi), %zmm3
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm6, %zmm3
-; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm26, %zmm3
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm0, %zmm3
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm4 = zmm16[0],zmm30[0],zmm16[2],zmm30[2],zmm16[4],zmm30[4],zmm16[6],zmm30[6]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm14, %zmm0, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 3648(%rdi), %zmm3
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 3584(%rdi), %zmm1
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, (%rsp) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm3, %zmm1, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 3776(%rdi), %zmm3
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 3712(%rdi), %zmm1
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm3, %zmm1, %zmm6
-; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7]
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm1 # 64-byte Folded Reload
-; AVX512BW-ONLY-SLOW-NEXT:    # zmm1 = zmm8[0],mem[0],zmm8[2],mem[2],zmm8[4],mem[4],zmm8[6],mem[6]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm1 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [5,13,5,13,5,13,5,13]
-; AVX512BW-ONLY-SLOW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm25, %zmm0, %zmm15
-; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [5,13,5,13]
-; AVX512BW-ONLY-SLOW-NEXT:    # ymm1 = mem[0,1,0,1]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm24, %zmm1, %zmm3
-; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3],ymm3[4,5,6,7]
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm11, %zmm0, %zmm13
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm4 = zmm22[1],zmm9[1],zmm22[3],zmm9[3],zmm22[5],zmm9[5],zmm22[7],zmm9[7]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm4 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm20, %zmm0, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm21, %zmm1, %zmm3
-; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm3
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm18, %zmm0, %zmm3
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm19
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm4 = zmm17[1],zmm19[1],zmm17[3],zmm19[3],zmm17[5],zmm19[5],zmm17[7],zmm19[7]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm14, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm16, %zmm0, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm28, %zmm1, %zmm3
-; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm4 # 64-byte Folded Reload
-; AVX512BW-ONLY-SLOW-NEXT:    # zmm4 = zmm12[1],mem[1],zmm12[3],mem[3],zmm12[5],mem[5],zmm12[7],mem[7]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm15, %zmm0, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm13, %zmm1, %zmm3
-; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload
-; AVX512BW-ONLY-SLOW-NEXT:    # zmm4 = zmm4[1],mem[1],zmm4[3],mem[3],zmm4[5],mem[5],zmm4[7],mem[7]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm11, %zmm0, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm29, %zmm1, %zmm3
-; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm27
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm4 = zmm4[1],zmm10[1],zmm4[3],zmm10[3],zmm4[5],zmm10[5],zmm4[7],zmm10[7]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm9, %zmm0, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm10, %zmm1, %zmm3
-; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload
-; AVX512BW-ONLY-SLOW-NEXT:    # zmm4 = zmm4[1],mem[1],zmm4[3],mem[3],zmm4[5],mem[5],zmm4[7],mem[7]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm0, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm3
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm1, %zmm3
-; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm26, %zmm3
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm0, %zmm3
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm4 = zmm5[1],zmm30[1],zmm5[3],zmm30[3],zmm5[5],zmm30[5],zmm5[7],zmm30[7]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 (%rsp), %zmm6 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm31, %zmm6, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm30, %zmm3, %zmm1
-; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm1 # 64-byte Folded Reload
-; AVX512BW-ONLY-SLOW-NEXT:    # zmm1 = zmm4[1],mem[1],zmm4[3],mem[3],zmm4[5],mem[5],zmm4[7],mem[7]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm1 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [6,14,6,14,6,14,6,14]
-; AVX512BW-ONLY-SLOW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm25, %zmm0, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14]
-; AVX512BW-ONLY-SLOW-NEXT:    # ymm1 = mem[0,1,0,1]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm24, %zmm1, %zmm3
-; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm22, %zmm3
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm22, %zmm3 {%k1} # 64-byte Folded Reload
-; AVX512BW-ONLY-SLOW-NEXT:    # zmm3 {%k1} = zmm22[0],mem[0],zmm22[2],mem[2],zmm22[4],mem[4],zmm22[6],mem[6]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm20, %zmm0, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm21, %zmm1, %zmm3
-; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm3
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm19, %zmm0, %zmm3
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm23[0],zmm18[0],zmm23[2],zmm18[2],zmm23[4],zmm18[4],zmm23[6],zmm18[6]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm16, %zmm0, %zmm14
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm28, %zmm1, %zmm3
-; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3],ymm3[4,5,6,7]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm3
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm18
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm28, %zmm0, %zmm3
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm20[0],zmm14[0],zmm20[2],zmm14[2],zmm20[4],zmm14[4],zmm20[6],zmm14[6]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm15, %zmm0, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm13, %zmm1, %zmm3
-; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm3
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm15, %zmm0, %zmm3
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm26[0],zmm25[0],zmm26[2],zmm25[2],zmm26[4],zmm25[4],zmm26[6],zmm25[6]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm11, %zmm0, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm29, %zmm1, %zmm3
-; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm27, %zmm0, %zmm3
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm24[0],zmm29[0],zmm24[2],zmm29[2],zmm24[4],zmm29[4],zmm24[6],zmm29[6]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm9, %zmm0, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm10, %zmm1, %zmm3
-; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm3
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm10, %zmm0, %zmm3
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm22[0],zmm19[0],zmm22[2],zmm19[2],zmm22[4],zmm19[4],zmm22[6],zmm19[6]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm1, %zmm8
-; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm8, %zmm0, %zmm5
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm21[0],zmm17[0],zmm21[2],zmm17[2],zmm21[4],zmm17[4],zmm21[6],zmm17[6]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm5, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm31, %zmm6, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm30, %zmm3, %zmm1
-; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm1[0],zmm23[0],zmm1[2],zmm23[2],zmm1[4],zmm23[4],zmm1[6],zmm23[6]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm30
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm5
-; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm16 = [7,15,7,15,7,15,7,15]
-; AVX512BW-ONLY-SLOW-NEXT:    # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm2, %zmm16, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm4[1],zmm1[1],zmm4[3],zmm1[3],zmm4[5],zmm1[5],zmm4[7],zmm1[7]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm6
-; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,8,0,8,0,8,0,8]
-; AVX512BW-ONLY-SLOW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm3, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm13 = [1,9,1,9,1,9,1,9]
-; AVX512BW-ONLY-SLOW-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm13, %zmm6
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm2, %zmm3, %zmm30
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm2, %zmm13, %zmm5
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm1
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm28, %zmm16, %zmm1
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm20[1],zmm14[1],zmm20[3],zmm14[3],zmm20[5],zmm14[5],zmm20[7],zmm14[7]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm1
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm14, %zmm3, %zmm20
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm14, %zmm13, %zmm1
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm28, %zmm3, %zmm18
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm28, %zmm13, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm28
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm15, %zmm16, %zmm12
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm12 {%k1} = zmm26[1],zmm25[1],zmm26[3],zmm25[3],zmm26[5],zmm25[5],zmm26[7],zmm25[7]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm26, %zmm1
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm25, %zmm3, %zmm26
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm25, %zmm13, %zmm1
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm15, %zmm3, %zmm28
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm15, %zmm13, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm25
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm27, %zmm16, %zmm1
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm24, %zmm7
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm24[1],zmm29[1],zmm24[3],zmm29[3],zmm24[5],zmm29[5],zmm24[7],zmm29[7]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm24, %zmm1
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm29, %zmm3, %zmm7
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm29, %zmm13, %zmm1
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm27, %zmm3, %zmm25
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm27, %zmm13, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm27
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm10, %zmm16, %zmm9
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm9 {%k1} = zmm22[1],zmm19[1],zmm22[3],zmm19[3],zmm22[5],zmm19[5],zmm22[7],zmm19[7]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm22, %zmm1
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm19, %zmm3, %zmm22
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm19, %zmm13, %zmm1
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm10, %zmm3, %zmm27
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm10, %zmm13, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm29
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm8, %zmm16, %zmm31
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm31 {%k1} = zmm21[1],zmm17[1],zmm21[3],zmm17[3],zmm21[5],zmm17[5],zmm21[7],zmm17[7]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm1
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm19
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm17, %zmm3, %zmm19
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm17, %zmm13, %zmm1
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm8, %zmm3, %zmm29
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm8, %zmm13, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm8, %zmm3, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm22
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm9, %zmm3, %zmm22
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm24, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm23, %zmm3, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm26, %zmm15, %zmm3
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm8, %zmm13, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm17
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm9, %zmm13, %zmm17
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm24, %zmm1
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm23, %zmm13, %zmm1
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm26, %zmm15, %zmm13
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm26, %zmm16, %zmm15
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm24[1],zmm23[1],zmm24[3],zmm23[3],zmm24[5],zmm23[5],zmm24[7],zmm23[7]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm15, %zmm21
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm15
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm9, %zmm16, %zmm15
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm10[1],zmm8[1],zmm10[3],zmm8[3],zmm10[5],zmm8[5],zmm10[7],zmm8[7]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm15, %zmm23
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm12 # 64-byte Folded Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm5 # 64-byte Folded Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm6 # 64-byte Folded Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm7 # 64-byte Folded Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm8 # 64-byte Folded Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm10 # 64-byte Folded Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm14 # 64-byte Folded Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} ymm16 = [7,15,7,15]
-; AVX512BW-ONLY-SLOW-NEXT:    # ymm16 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm15 # 64-byte Folded Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm9 # 64-byte Folded Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm1 # 64-byte Folded Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm2 # 64-byte Folded Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm4 # 64-byte Folded Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm11 # 64-byte Folded Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm11 # 64-byte Folded Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm15 = ymm12[0,1,2,3],ymm15[4,5,6,7]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm15, %zmm12, %zmm12
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm15 = ymm5[0,1,2,3],ymm9[4,5,6,7]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm15, %zmm5, %zmm5
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm15 = ymm6[0,1,2,3],ymm0[4,5,6,7]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm15, %zmm0, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm15 = ymm7[0,1,2,3],ymm1[4,5,6,7]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm15, %zmm0, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm15 = ymm8[0,1,2,3],ymm2[4,5,6,7]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm15, %zmm0, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm15 = ymm10[0,1,2,3],ymm4[4,5,6,7]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm15, %zmm31, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload
-; AVX512BW-ONLY-SLOW-NEXT:    # ymm15 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm15, %zmm21, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm15 = ymm14[0,1,2,3],ymm11[4,5,6,7]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm15, %zmm23, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm18 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 192(%rdi), %xmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 128(%rdi), %xmm1
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 64(%rdi), %xmm1
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm16 = xmm0[0],xmm1[0]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti32x4 $1, %xmm2, %ymm16, %ymm2
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm18, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm30 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 704(%rdi), %xmm8
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 640(%rdi), %xmm20
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm20[0],xmm8[0]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 576(%rdi), %xmm18
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 512(%rdi), %xmm16
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm31 = xmm16[0],xmm18[0]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti32x4 $1, %xmm2, %ymm31, %ymm2
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm30, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm25 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 1216(%rdi), %xmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 1152(%rdi), %xmm1
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm31 = xmm1[0],xmm0[0]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 1088(%rdi), %xmm23
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 1024(%rdi), %xmm14
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm30 = xmm14[0],xmm23[0]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti32x4 $1, %xmm31, %ymm30, %ymm30
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm30, %zmm25, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm28 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 1728(%rdi), %xmm30
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 1664(%rdi), %xmm31
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm31[0],xmm30[0]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 1600(%rdi), %xmm12
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 1536(%rdi), %xmm11
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm25 = xmm11[0],xmm12[0]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti32x4 $1, %xmm2, %ymm25, %ymm2
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm28, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm19, %zmm29 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 2240(%rdi), %xmm19
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 2176(%rdi), %xmm21
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm25 = xmm21[0],xmm19[0]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 2112(%rdi), %xmm15
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 2048(%rdi), %xmm1
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm28 = xmm1[0],xmm15[0]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti32x4 $1, %xmm25, %ymm28, %ymm25
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm25, %zmm29, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm27 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 2752(%rdi), %xmm28
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 2688(%rdi), %xmm29
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm29[0],xmm28[0]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 2624(%rdi), %xmm10
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 2560(%rdi), %xmm9
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm25 = xmm9[0],xmm10[0]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti32x4 $1, %xmm2, %ymm25, %ymm2
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm27, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm22 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 3264(%rdi), %xmm25
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 3200(%rdi), %xmm27
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm27[0],xmm25[0]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 3136(%rdi), %xmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 3072(%rdi), %xmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm26 = xmm2[0],xmm0[0]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti32x4 $1, %xmm4, %ymm26, %ymm4
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm22, %zmm22
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm3 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 3776(%rdi), %xmm7
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 3712(%rdi), %xmm26
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm26[0],xmm7[0]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 3648(%rdi), %xmm6
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 3584(%rdi), %xmm5
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm24 = xmm5[0],xmm6[0]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti32x4 $1, %xmm4, %ymm24, %ymm4
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm3, %zmm24
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm17 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm4 = xmm27[1],xmm25[1]
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm0[1]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm4, %ymm2, %ymm2
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm17, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm0 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm3 = xmm20[1],xmm8[1]
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm4 = xmm16[1],xmm18[1]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm3, %ymm4, %ymm3
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm0, %zmm3
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm0 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
-; AVX512BW-ONLY-SLOW-NEXT:    # xmm4 = xmm4[1],mem[1]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload
-; AVX512BW-ONLY-SLOW-NEXT:    # xmm8 = xmm8[1],mem[1]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm4, %ymm8, %ymm4
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm0, %zmm4
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm0 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm8 = xmm31[1],xmm30[1]
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm11 = xmm11[1],xmm12[1]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm8, %ymm11, %ymm8
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm8, %zmm0, %zmm8
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm0 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload
-; AVX512BW-ONLY-SLOW-NEXT:    # xmm11 = xmm11[1],mem[1]
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm12 = xmm14[1],xmm23[1]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm11, %ymm12, %ymm11
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm11, %zmm0, %zmm11
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm0 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm12 = xmm29[1],xmm28[1]
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm9 = xmm9[1],xmm10[1]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm12, %ymm9, %ymm9
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm9, %zmm0, %zmm9
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm0 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm10 = xmm21[1],xmm19[1]
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm12 = xmm1[1],xmm15[1]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm10, %ymm12, %ymm10
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm10, %zmm0, %zmm10
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm13 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm7 = xmm26[1],xmm7[1]
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm6[1]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm7, %ymm5, %ymm5
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm5, %zmm13, %zmm1
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm24, 448(%rsi)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm22, 384(%rsi)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 320(%rsi)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 256(%rsi)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 192(%rsi)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 128(%rsi)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 64(%rsi)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, (%rsi)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, 448(%rdx)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm10, 256(%rdx)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, 320(%rdx)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm11, 128(%rdx)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, 192(%rdx)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, (%rdx)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, 64(%rdx)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, 384(%rdx)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 448(%rcx)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 256(%rcx)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 320(%rcx)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 128(%rcx)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 192(%rcx)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, (%rcx)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 64(%rcx)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 384(%rcx)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 448(%r8)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 256(%r8)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 320(%r8)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 128(%r8)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 192(%r8)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, (%r8)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 64(%r8)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 384(%r8)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 448(%r9)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 256(%r9)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 320(%r9)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 128(%r9)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 192(%r9)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, (%r9)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 64(%r9)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 384(%r9)
-; AVX512BW-ONLY-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 448(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 256(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 320(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 128(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 192(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, (%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 64(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 384(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 448(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 256(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 320(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 128(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 192(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, (%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 64(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 384(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-ONLY-SLOW-NEXT:    vmovups (%rsp), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 384(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 448(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 256(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 320(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 128(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 192(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, (%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 64(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    addq $6728, %rsp # imm = 0x1A48
-; AVX512BW-ONLY-SLOW-NEXT:    vzeroupper
-; AVX512BW-ONLY-SLOW-NEXT:    retq
-;
-; AVX512BW-ONLY-FAST-LABEL: load_i64_stride8_vf64:
-; AVX512BW-ONLY-FAST:       # %bb.0:
-; AVX512BW-ONLY-FAST-NEXT:    subq $6728, %rsp # imm = 0x1A48
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 3392(%rdi), %zmm3
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 3328(%rdi), %zmm13
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 3520(%rdi), %zmm1
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 3456(%rdi), %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 1856(%rdi), %zmm11
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 1984(%rdi), %zmm15
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 832(%rdi), %zmm6
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 768(%rdi), %zmm7
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 960(%rdi), %zmm24
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 896(%rdi), %zmm10
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 320(%rdi), %zmm12
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 256(%rdi), %zmm5
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 448(%rdi), %zmm8
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 384(%rdi), %zmm14
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    movb $-64, %al
-; AVX512BW-ONLY-FAST-NEXT:    kmovd %eax, %k1
-; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10]
-; AVX512BW-ONLY-FAST-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm1
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm2, %zmm1
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 3264(%rdi), %ymm21
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 3200(%rdi), %ymm0
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm21[0],ymm0[2],ymm21[2]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 3136(%rdi), %ymm4
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 3072(%rdi), %ymm3
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu %ymm3, (%rsp) # 32-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2]
-; AVX512BW-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm10, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm24, %zmm2, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, %zmm1
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm6, %zmm2, %zmm1
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 704(%rdi), %ymm3
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 640(%rdi), %ymm0
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 576(%rdi), %ymm25
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 512(%rdi), %ymm23
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm23[0],ymm25[0],ymm23[2],ymm25[2]
-; AVX512BW-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm14, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm8, %zmm2, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, %zmm1
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm12, %zmm2, %zmm1
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 192(%rdi), %ymm0
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 128(%rdi), %ymm31
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm31[0],ymm0[0],ymm31[2],ymm0[2]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%rdi), %ymm20
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%rdi), %ymm19
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm19[0],ymm20[0],ymm19[2],ymm20[2]
-; AVX512BW-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 1920(%rdi), %zmm3
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm15, %zmm2, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 1792(%rdi), %zmm1
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm11, %zmm2, %zmm1
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 1728(%rdi), %ymm3
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 1664(%rdi), %ymm0
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 1600(%rdi), %ymm18
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 1536(%rdi), %ymm26
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm26[0],ymm18[0],ymm26[2],ymm18[2]
-; AVX512BW-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 1472(%rdi), %zmm1
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 1408(%rdi), %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 1344(%rdi), %zmm3
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 1280(%rdi), %zmm1
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm2, %zmm1
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 1216(%rdi), %ymm28
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 1152(%rdi), %ymm29
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm29[0],ymm28[0],ymm29[2],ymm28[2]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 1088(%rdi), %ymm30
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 1024(%rdi), %ymm27
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm27[0],ymm30[0],ymm27[2],ymm30[2]
-; AVX512BW-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 3008(%rdi), %zmm1
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 2944(%rdi), %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 2880(%rdi), %zmm3
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 2816(%rdi), %zmm1
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm2, %zmm1
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 2752(%rdi), %ymm0
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 2688(%rdi), %ymm11
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm11[0],ymm0[0],ymm11[2],ymm0[2]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 2624(%rdi), %ymm16
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 2560(%rdi), %ymm9
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm9[0],ymm16[0],ymm9[2],ymm16[2]
-; AVX512BW-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 2496(%rdi), %zmm10
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 2432(%rdi), %zmm7
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm10, %zmm2, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 2368(%rdi), %zmm3
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 2304(%rdi), %zmm1
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm2, %zmm1
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 2240(%rdi), %ymm0
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 2176(%rdi), %ymm3
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 2112(%rdi), %ymm3
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 2048(%rdi), %ymm8
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm8[0],ymm3[0],ymm8[2],ymm3[2]
-; AVX512BW-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 4032(%rdi), %zmm1
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 3968(%rdi), %zmm6
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm6, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 3904(%rdi), %zmm1
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 3840(%rdi), %zmm3
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm1, %zmm3, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm3
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k1}
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 3776(%rdi), %ymm22
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 3712(%rdi), %ymm17
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm15 = ymm17[0],ymm22[0],ymm17[2],ymm22[2]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 3648(%rdi), %ymm12
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 3584(%rdi), %ymm0
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm14 = ymm0[0],ymm12[0],ymm0[2],ymm12[2]
-; AVX512BW-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm14 = ymm14[2,3],ymm15[2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm14, %zmm2, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,3,11,3,11,3,11]
-; AVX512BW-ONLY-FAST-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm14
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm2, %zmm14
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm15
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm15 # 64-byte Folded Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm14, %zmm15 {%k1}
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm14 = ymm13[1],ymm21[1],ymm13[3],ymm21[3]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu (%rsp), %ymm13 # 32-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
-; AVX512BW-ONLY-FAST-NEXT:    # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3]
-; AVX512BW-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm13 = ymm13[2,3],ymm14[2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm13, %zmm15, %zmm13
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm24, %zmm2, %zmm13
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm21, %zmm14
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm14 # 64-byte Folded Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm14 {%k1}
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
-; AVX512BW-ONLY-FAST-NEXT:    # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3]
-; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm15 = ymm23[1],ymm25[1],ymm23[3],ymm25[3]
-; AVX512BW-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm13, %zmm14, %zmm13
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm25, %zmm13
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm24, %zmm2, %zmm13
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm23, %zmm2, %zmm5
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm5 {%k1}
-; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm31, %ymm13 # 32-byte Folded Reload
-; AVX512BW-ONLY-FAST-NEXT:    # ymm13 = ymm31[1],mem[1],ymm31[3],mem[3]
-; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm15 = ymm19[1],ymm20[1],ymm19[3],ymm20[3]
-; AVX512BW-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm13, %zmm5, %zmm13
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm31, %zmm13
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm20, %zmm2, %zmm13
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm19, %zmm2, %zmm14
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm14 {%k1}
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm13 # 32-byte Folded Reload
-; AVX512BW-ONLY-FAST-NEXT:    # ymm13 = ymm5[1],mem[1],ymm5[3],mem[3]
-; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm15 = ymm26[1],ymm18[1],ymm26[3],ymm18[3]
-; AVX512BW-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm13, %zmm14, %zmm13
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm18, %zmm13
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm26, %zmm2, %zmm13
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm14 # 64-byte Folded Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm14 {%k1}
-; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm13 = ymm29[1],ymm28[1],ymm29[3],ymm28[3]
-; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm15 = ymm27[1],ymm30[1],ymm27[3],ymm30[3]
-; AVX512BW-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm13, %zmm14, %zmm13
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm27, %zmm13
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm28, %zmm2, %zmm13
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm30, %zmm14
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm29, %zmm2, %zmm14
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm14 {%k1}
-; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
-; AVX512BW-ONLY-FAST-NEXT:    # ymm11 = ymm11[1],mem[1],ymm11[3],mem[3]
-; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm9 = ymm9[1],ymm16[1],ymm9[3],ymm16[3]
-; AVX512BW-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm9 = ymm9[2,3],ymm11[2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm9, %zmm14, %zmm9
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm10, %zmm2, %zmm7
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm16, %zmm10
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm10 # 64-byte Folded Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, %zmm10 {%k1}
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm7 # 32-byte Folded Reload
-; AVX512BW-ONLY-FAST-NEXT:    # ymm7 = ymm5[1],mem[1],ymm5[3],mem[3]
-; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm5 # 32-byte Folded Reload
-; AVX512BW-ONLY-FAST-NEXT:    # ymm5 = ymm8[1],mem[1],ymm8[3],mem[3]
-; AVX512BW-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm5 = ymm5[2,3],ymm7[2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm5, %zmm10, %zmm5
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm14, %zmm2, %zmm6
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm3, %zmm8, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm6, %zmm2 {%k1}
-; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm17[1],ymm22[1],ymm17[3],ymm22[3]
-; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm12[1],ymm0[3],ymm12[3]
-; AVX512BW-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12]
-; AVX512BW-ONLY-FAST-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm13
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm11
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm0, %zmm1
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm2 = zmm22[0],zmm9[0],zmm22[2],zmm9[2],zmm22[4],zmm9[4],zmm22[6],zmm9[6]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 3136(%rdi), %zmm4
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 3072(%rdi), %zmm15
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm15, %zmm3
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm0, %zmm3
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 3264(%rdi), %zmm1
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 3200(%rdi), %zmm4
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm6 = [4,12,4,12]
-; AVX512BW-ONLY-FAST-NEXT:    # ymm6 = mem[0,1,0,1]
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm6, %zmm4
-; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm2, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm21[0],zmm17[0],zmm21[2],zmm17[2],zmm21[4],zmm17[4],zmm21[6],zmm17[6]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 576(%rdi), %zmm1
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 512(%rdi), %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm0, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 704(%rdi), %zmm21
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 640(%rdi), %zmm4
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm21, %zmm6, %zmm4
-; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm25, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm24, %zmm0, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm12[0],zmm23[0],zmm12[2],zmm23[2],zmm12[4],zmm23[4],zmm12[6],zmm23[6]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%rdi), %zmm1
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%rdi), %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm0, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 192(%rdi), %zmm1
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 128(%rdi), %zmm4
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm6, %zmm4
-; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm31, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm20, %zmm0, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm1[0],zmm19[0],zmm1[2],zmm19[2],zmm1[4],zmm19[4],zmm1[6],zmm19[6]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 1600(%rdi), %zmm1
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 1536(%rdi), %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm0, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 1728(%rdi), %zmm1
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 1664(%rdi), %zmm4
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm6, %zmm4
-; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm18, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm26, %zmm0, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm1[0],zmm10[0],zmm1[2],zmm10[2],zmm1[4],zmm10[4],zmm1[6],zmm10[6]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 1088(%rdi), %zmm1
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 1024(%rdi), %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm0, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 1216(%rdi), %zmm1
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 1152(%rdi), %zmm4
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm6, %zmm4
-; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm27, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm28, %zmm0, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm30[0],zmm29[0],zmm30[2],zmm29[2],zmm30[4],zmm29[4],zmm30[6],zmm29[6]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 2624(%rdi), %zmm1
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 2560(%rdi), %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm0, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 2752(%rdi), %zmm1
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 2688(%rdi), %zmm4
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm6, %zmm4
-; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 2112(%rdi), %zmm7
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 2048(%rdi), %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm0, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 2240(%rdi), %zmm1
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 2176(%rdi), %zmm3
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm6, %zmm3
-; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm26, %zmm3
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm0, %zmm3
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm4 = zmm16[0],zmm30[0],zmm16[2],zmm30[2],zmm16[4],zmm30[4],zmm16[6],zmm30[6]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm31, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm14, %zmm0, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 3648(%rdi), %zmm3
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 3584(%rdi), %zmm1
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, (%rsp) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm3, %zmm1, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 3776(%rdi), %zmm3
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 3712(%rdi), %zmm1
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm3, %zmm1, %zmm6
-; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7]
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm1 # 64-byte Folded Reload
-; AVX512BW-ONLY-FAST-NEXT:    # zmm1 = zmm8[0],mem[0],zmm8[2],mem[2],zmm8[4],mem[4],zmm8[6],mem[6]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm1 {%k1}
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [5,13,5,13,5,13,5,13]
-; AVX512BW-ONLY-FAST-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm25, %zmm0, %zmm15
-; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [5,13,5,13]
-; AVX512BW-ONLY-FAST-NEXT:    # ymm1 = mem[0,1,0,1]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm24, %zmm1, %zmm3
-; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3],ymm3[4,5,6,7]
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm11, %zmm0, %zmm13
-; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm4 = zmm22[1],zmm9[1],zmm22[3],zmm9[3],zmm22[5],zmm9[5],zmm22[7],zmm9[7]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm4 {%k1}
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm20, %zmm0, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm21, %zmm1, %zmm3
-; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm23, %zmm3
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm18, %zmm0, %zmm3
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm17, %zmm19
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm4 = zmm17[1],zmm19[1],zmm17[3],zmm19[3],zmm17[5],zmm19[5],zmm17[7],zmm19[7]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm14, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm16, %zmm0, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm28, %zmm1, %zmm3
-; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm4 # 64-byte Folded Reload
-; AVX512BW-ONLY-FAST-NEXT:    # zmm4 = zmm12[1],mem[1],zmm12[3],mem[3],zmm12[5],mem[5],zmm12[7],mem[7]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm15, %zmm0, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm13, %zmm1, %zmm3
-; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload
-; AVX512BW-ONLY-FAST-NEXT:    # zmm4 = zmm4[1],mem[1],zmm4[3],mem[3],zmm4[5],mem[5],zmm4[7],mem[7]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm11, %zmm0, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm29, %zmm1, %zmm3
-; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm10, %zmm27
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm4 = zmm4[1],zmm10[1],zmm4[3],zmm10[3],zmm4[5],zmm10[5],zmm4[7],zmm10[7]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm9, %zmm0, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm10, %zmm1, %zmm3
-; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload
-; AVX512BW-ONLY-FAST-NEXT:    # zmm4 = zmm4[1],mem[1],zmm4[3],mem[3],zmm4[5],mem[5],zmm4[7],mem[7]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm0, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, %zmm3
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm1, %zmm3
-; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm26, %zmm3
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm0, %zmm3
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm4 = zmm5[1],zmm30[1],zmm5[3],zmm30[3],zmm5[5],zmm30[5],zmm5[7],zmm30[7]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm31, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 (%rsp), %zmm6 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm31, %zmm6, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm30, %zmm3, %zmm1
-; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm1 # 64-byte Folded Reload
-; AVX512BW-ONLY-FAST-NEXT:    # zmm1 = zmm4[1],mem[1],zmm4[3],mem[3],zmm4[5],mem[5],zmm4[7],mem[7]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm1 {%k1}
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [6,14,6,14,6,14,6,14]
-; AVX512BW-ONLY-FAST-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm25, %zmm0, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14]
-; AVX512BW-ONLY-FAST-NEXT:    # ymm1 = mem[0,1,0,1]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm24, %zmm1, %zmm3
-; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm22, %zmm3
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm22, %zmm3 {%k1} # 64-byte Folded Reload
-; AVX512BW-ONLY-FAST-NEXT:    # zmm3 {%k1} = zmm22[0],mem[0],zmm22[2],mem[2],zmm22[4],mem[4],zmm22[6],mem[6]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm20, %zmm0, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm21, %zmm1, %zmm3
-; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm17, %zmm3
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm19, %zmm0, %zmm3
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm23[0],zmm18[0],zmm23[2],zmm18[2],zmm23[4],zmm18[4],zmm23[6],zmm18[6]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm16, %zmm0, %zmm14
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm28, %zmm1, %zmm3
-; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3],ymm3[4,5,6,7]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm12, %zmm3
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm12, %zmm18
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm28, %zmm0, %zmm3
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm20[0],zmm14[0],zmm20[2],zmm14[2],zmm20[4],zmm14[4],zmm20[6],zmm14[6]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm15, %zmm0, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm13, %zmm1, %zmm3
-; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm12, %zmm3
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm15, %zmm0, %zmm3
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm26[0],zmm25[0],zmm26[2],zmm25[2],zmm26[4],zmm25[4],zmm26[6],zmm25[6]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm11, %zmm0, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm29, %zmm1, %zmm3
-; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm27, %zmm0, %zmm3
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm24[0],zmm29[0],zmm24[2],zmm29[2],zmm24[4],zmm29[4],zmm24[6],zmm29[6]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm9, %zmm0, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm10, %zmm1, %zmm3
-; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, %zmm3
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm10, %zmm0, %zmm3
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm22[0],zmm19[0],zmm22[2],zmm19[2],zmm22[4],zmm19[4],zmm22[6],zmm19[6]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm1, %zmm8
-; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm8, %zmm0, %zmm5
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm21[0],zmm17[0],zmm21[2],zmm17[2],zmm21[4],zmm17[4],zmm21[6],zmm17[6]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm5, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm31, %zmm6, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm30, %zmm3, %zmm1
-; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm1[0],zmm23[0],zmm1[2],zmm23[2],zmm1[4],zmm23[4],zmm1[6],zmm23[6]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm30
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm5
-; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm16 = [7,15,7,15,7,15,7,15]
-; AVX512BW-ONLY-FAST-NEXT:    # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm2, %zmm16, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm4[1],zmm1[1],zmm4[3],zmm1[3],zmm4[5],zmm1[5],zmm4[7],zmm1[7]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm6
-; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,8,0,8,0,8,0,8]
-; AVX512BW-ONLY-FAST-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm3, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm13 = [1,9,1,9,1,9,1,9]
-; AVX512BW-ONLY-FAST-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm13, %zmm6
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm2, %zmm3, %zmm30
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm2, %zmm13, %zmm5
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm18, %zmm1
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm18, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm28, %zmm16, %zmm1
-; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm20[1],zmm14[1],zmm20[3],zmm14[3],zmm20[5],zmm14[5],zmm20[7],zmm14[7]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm20, %zmm1
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm14, %zmm3, %zmm20
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm14, %zmm13, %zmm1
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm28, %zmm3, %zmm18
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm28, %zmm13, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm12, %zmm28
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm12, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm15, %zmm16, %zmm12
-; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm12 {%k1} = zmm26[1],zmm25[1],zmm26[3],zmm25[3],zmm26[5],zmm25[5],zmm26[7],zmm25[7]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm26, %zmm1
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm25, %zmm3, %zmm26
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm25, %zmm13, %zmm1
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm15, %zmm3, %zmm28
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm15, %zmm13, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm25
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm27, %zmm16, %zmm1
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm24, %zmm7
-; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm24[1],zmm29[1],zmm24[3],zmm29[3],zmm24[5],zmm29[5],zmm24[7],zmm29[7]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm24, %zmm1
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm29, %zmm3, %zmm7
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm29, %zmm13, %zmm1
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm27, %zmm3, %zmm25
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm27, %zmm13, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, %zmm27
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm10, %zmm16, %zmm9
-; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm9 {%k1} = zmm22[1],zmm19[1],zmm22[3],zmm19[3],zmm22[5],zmm19[5],zmm22[7],zmm19[7]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm22, %zmm1
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm19, %zmm3, %zmm22
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm19, %zmm13, %zmm1
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm10, %zmm3, %zmm27
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm10, %zmm13, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm31, %zmm29
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm31, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm8, %zmm16, %zmm31
-; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm31 {%k1} = zmm21[1],zmm17[1],zmm21[3],zmm17[3],zmm21[5],zmm17[5],zmm21[7],zmm17[7]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm21, %zmm1
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm21, %zmm19
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm17, %zmm3, %zmm19
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm17, %zmm13, %zmm1
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm8, %zmm3, %zmm29
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm8, %zmm13, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm10, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm8, %zmm3, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm21, %zmm22
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm9, %zmm3, %zmm22
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm24, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm23, %zmm3, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm26, %zmm15, %zmm3
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm10, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm8, %zmm13, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm21, %zmm17
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm21, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm9, %zmm13, %zmm17
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm24, %zmm1
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm23, %zmm13, %zmm1
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm26, %zmm15, %zmm13
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm26, %zmm16, %zmm15
-; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm24[1],zmm23[1],zmm24[3],zmm23[3],zmm24[5],zmm23[5],zmm24[7],zmm23[7]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm15, %zmm21
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm15
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm9, %zmm16, %zmm15
-; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm10[1],zmm8[1],zmm10[3],zmm8[3],zmm10[5],zmm8[5],zmm10[7],zmm8[7]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm15, %zmm23
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm12 # 64-byte Folded Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm5 # 64-byte Folded Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm6 # 64-byte Folded Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm7 # 64-byte Folded Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm8 # 64-byte Folded Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm10 # 64-byte Folded Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm14 # 64-byte Folded Reload
-; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} ymm16 = [7,15,7,15]
-; AVX512BW-ONLY-FAST-NEXT:    # ymm16 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm15 # 64-byte Folded Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm9 # 64-byte Folded Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm1 # 64-byte Folded Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm2 # 64-byte Folded Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm4 # 64-byte Folded Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm11 # 64-byte Folded Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm11 # 64-byte Folded Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm15 = ymm12[0,1,2,3],ymm15[4,5,6,7]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm15, %zmm12, %zmm12
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm15 = ymm5[0,1,2,3],ymm9[4,5,6,7]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm15, %zmm5, %zmm5
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm15 = ymm6[0,1,2,3],ymm0[4,5,6,7]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm15, %zmm0, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm15 = ymm7[0,1,2,3],ymm1[4,5,6,7]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm15, %zmm0, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm15 = ymm8[0,1,2,3],ymm2[4,5,6,7]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm15, %zmm0, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm15 = ymm10[0,1,2,3],ymm4[4,5,6,7]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm15, %zmm31, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload
-; AVX512BW-ONLY-FAST-NEXT:    # ymm15 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm15, %zmm21, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm15 = ymm14[0,1,2,3],ymm11[4,5,6,7]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm15, %zmm23, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm20, %zmm18 {%k1}
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 192(%rdi), %xmm0
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 128(%rdi), %xmm1
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 64(%rdi), %xmm1
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm16 = xmm0[0],xmm1[0]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti32x4 $1, %xmm2, %ymm16, %ymm2
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm18, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm30 {%k1}
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 704(%rdi), %xmm8
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 640(%rdi), %xmm20
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm20[0],xmm8[0]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 576(%rdi), %xmm18
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 512(%rdi), %xmm16
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm31 = xmm16[0],xmm18[0]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti32x4 $1, %xmm2, %ymm31, %ymm2
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm30, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm25 {%k1}
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 1216(%rdi), %xmm0
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 1152(%rdi), %xmm1
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm31 = xmm1[0],xmm0[0]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 1088(%rdi), %xmm23
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 1024(%rdi), %xmm14
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm30 = xmm14[0],xmm23[0]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti32x4 $1, %xmm31, %ymm30, %ymm30
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm30, %zmm25, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm28 {%k1}
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 1728(%rdi), %xmm30
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 1664(%rdi), %xmm31
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm31[0],xmm30[0]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 1600(%rdi), %xmm12
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 1536(%rdi), %xmm11
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm25 = xmm11[0],xmm12[0]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti32x4 $1, %xmm2, %ymm25, %ymm2
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm28, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm19, %zmm29 {%k1}
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 2240(%rdi), %xmm19
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 2176(%rdi), %xmm21
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm25 = xmm21[0],xmm19[0]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 2112(%rdi), %xmm15
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 2048(%rdi), %xmm1
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm28 = xmm1[0],xmm15[0]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti32x4 $1, %xmm25, %ymm28, %ymm25
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm25, %zmm29, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm27 {%k1}
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 2752(%rdi), %xmm28
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 2688(%rdi), %xmm29
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm29[0],xmm28[0]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 2624(%rdi), %xmm10
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 2560(%rdi), %xmm9
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm25 = xmm9[0],xmm10[0]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti32x4 $1, %xmm2, %ymm25, %ymm2
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm27, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm22 {%k1}
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 3264(%rdi), %xmm25
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 3200(%rdi), %xmm27
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm27[0],xmm25[0]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 3136(%rdi), %xmm0
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 3072(%rdi), %xmm2
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm26 = xmm2[0],xmm0[0]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti32x4 $1, %xmm4, %ymm26, %ymm4
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm22, %zmm22
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm3 {%k1}
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 3776(%rdi), %xmm7
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 3712(%rdi), %xmm26
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm26[0],xmm7[0]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 3648(%rdi), %xmm6
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 3584(%rdi), %xmm5
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm24 = xmm5[0],xmm6[0]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti32x4 $1, %xmm4, %ymm24, %ymm4
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm3, %zmm24
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm17 {%k1}
-; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm4 = xmm27[1],xmm25[1]
-; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm0[1]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti128 $1, %xmm4, %ymm2, %ymm2
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm17, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm0 {%k1}
-; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm3 = xmm20[1],xmm8[1]
-; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm4 = xmm16[1],xmm18[1]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti128 $1, %xmm3, %ymm4, %ymm3
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm0, %zmm3
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm0 {%k1}
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
-; AVX512BW-ONLY-FAST-NEXT:    # xmm4 = xmm4[1],mem[1]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload
-; AVX512BW-ONLY-FAST-NEXT:    # xmm8 = xmm8[1],mem[1]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti128 $1, %xmm4, %ymm8, %ymm4
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm0, %zmm4
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, %zmm0 {%k1}
-; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm8 = xmm31[1],xmm30[1]
-; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm11 = xmm11[1],xmm12[1]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti128 $1, %xmm8, %ymm11, %ymm8
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm8, %zmm0, %zmm8
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm11, %zmm0 {%k1}
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload
-; AVX512BW-ONLY-FAST-NEXT:    # xmm11 = xmm11[1],mem[1]
-; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm12 = xmm14[1],xmm23[1]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti128 $1, %xmm11, %ymm12, %ymm11
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm11, %zmm0, %zmm11
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm12, %zmm0 {%k1}
-; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm12 = xmm29[1],xmm28[1]
-; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm9 = xmm9[1],xmm10[1]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti128 $1, %xmm12, %ymm9, %ymm9
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm9, %zmm0, %zmm9
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm10, %zmm0 {%k1}
-; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm10 = xmm21[1],xmm19[1]
-; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm12 = xmm1[1],xmm15[1]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti128 $1, %xmm10, %ymm12, %ymm10
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm10, %zmm0, %zmm10
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm13 {%k1}
-; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm7 = xmm26[1],xmm7[1]
-; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm6[1]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti128 $1, %xmm7, %ymm5, %ymm5
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm5, %zmm13, %zmm1
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm24, 448(%rsi)
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm22, 384(%rsi)
-; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 320(%rsi)
-; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 256(%rsi)
-; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 192(%rsi)
-; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 128(%rsi)
-; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 64(%rsi)
-; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, (%rsi)
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, 448(%rdx)
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm10, 256(%rdx)
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, 320(%rdx)
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm11, 128(%rdx)
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, 192(%rdx)
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, (%rdx)
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, 64(%rdx)
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, 384(%rdx)
-; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 448(%rcx)
-; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 256(%rcx)
-; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 320(%rcx)
-; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 128(%rcx)
-; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 192(%rcx)
-; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, (%rcx)
-; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 64(%rcx)
-; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 384(%rcx)
-; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 448(%r8)
-; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 256(%r8)
-; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 320(%r8)
-; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 128(%r8)
-; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 192(%r8)
-; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, (%r8)
-; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 64(%r8)
-; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 384(%r8)
-; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 448(%r9)
-; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 256(%r9)
-; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 320(%r9)
-; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 128(%r9)
-; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 192(%r9)
-; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, (%r9)
-; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 64(%r9)
-; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 384(%r9)
-; AVX512BW-ONLY-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 448(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 256(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 320(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 128(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 192(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, (%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 64(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 384(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 448(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 256(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 320(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 128(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 192(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, (%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 64(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 384(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-ONLY-FAST-NEXT:    vmovups (%rsp), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 384(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 448(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 256(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 320(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 128(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 192(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, (%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 64(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    addq $6728, %rsp # imm = 0x1A48
-; AVX512BW-ONLY-FAST-NEXT:    vzeroupper
-; AVX512BW-ONLY-FAST-NEXT:    retq
-;
-; AVX512DQBW-SLOW-LABEL: load_i64_stride8_vf64:
-; AVX512DQBW-SLOW:       # %bb.0:
-; AVX512DQBW-SLOW-NEXT:    subq $6728, %rsp # imm = 0x1A48
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 3392(%rdi), %zmm3
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 3328(%rdi), %zmm13
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 3520(%rdi), %zmm1
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 3456(%rdi), %zmm0
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 1856(%rdi), %zmm11
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 1984(%rdi), %zmm15
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 832(%rdi), %zmm6
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 768(%rdi), %zmm7
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 960(%rdi), %zmm24
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 896(%rdi), %zmm10
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 320(%rdi), %zmm12
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 256(%rdi), %zmm5
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 448(%rdi), %zmm8
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 384(%rdi), %zmm14
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    movb $-64, %al
-; AVX512DQBW-SLOW-NEXT:    kmovd %eax, %k1
-; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10]
-; AVX512DQBW-SLOW-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm1
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm3, %zmm2, %zmm1
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 3264(%rdi), %ymm21
-; AVX512DQBW-SLOW-NEXT:    vmovdqa 3200(%rdi), %ymm0
-; AVX512DQBW-SLOW-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm21[0],ymm0[2],ymm21[2]
-; AVX512DQBW-SLOW-NEXT:    vmovdqa 3136(%rdi), %ymm4
-; AVX512DQBW-SLOW-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa 3072(%rdi), %ymm3
-; AVX512DQBW-SLOW-NEXT:    vmovdqu %ymm3, (%rsp) # 32-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2]
-; AVX512DQBW-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm24, %zmm2, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm1
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm6, %zmm2, %zmm1
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vmovdqa 704(%rdi), %ymm3
-; AVX512DQBW-SLOW-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa 640(%rdi), %ymm0
-; AVX512DQBW-SLOW-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 576(%rdi), %ymm25
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 512(%rdi), %ymm23
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm23[0],ymm25[0],ymm23[2],ymm25[2]
-; AVX512DQBW-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm14, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm8, %zmm2, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm1
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm12, %zmm2, %zmm1
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vmovdqa 192(%rdi), %ymm0
-; AVX512DQBW-SLOW-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 128(%rdi), %ymm31
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm31[0],ymm0[0],ymm31[2],ymm0[2]
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 64(%rdi), %ymm20
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%rdi), %ymm19
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm19[0],ymm20[0],ymm19[2],ymm20[2]
-; AVX512DQBW-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 1920(%rdi), %zmm3
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm15, %zmm2, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 1792(%rdi), %zmm1
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm11, %zmm2, %zmm1
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vmovdqa 1728(%rdi), %ymm3
-; AVX512DQBW-SLOW-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa 1664(%rdi), %ymm0
-; AVX512DQBW-SLOW-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 1600(%rdi), %ymm18
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 1536(%rdi), %ymm26
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm26[0],ymm18[0],ymm26[2],ymm18[2]
-; AVX512DQBW-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 1472(%rdi), %zmm1
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 1408(%rdi), %zmm0
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 1344(%rdi), %zmm3
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 1280(%rdi), %zmm1
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm3, %zmm2, %zmm1
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 1216(%rdi), %ymm28
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 1152(%rdi), %ymm29
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm29[0],ymm28[0],ymm29[2],ymm28[2]
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 1088(%rdi), %ymm30
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 1024(%rdi), %ymm27
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm27[0],ymm30[0],ymm27[2],ymm30[2]
-; AVX512DQBW-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 3008(%rdi), %zmm1
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 2944(%rdi), %zmm0
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 2880(%rdi), %zmm3
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 2816(%rdi), %zmm1
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm3, %zmm2, %zmm1
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vmovdqa 2752(%rdi), %ymm0
-; AVX512DQBW-SLOW-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa 2688(%rdi), %ymm11
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm11[0],ymm0[0],ymm11[2],ymm0[2]
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 2624(%rdi), %ymm16
-; AVX512DQBW-SLOW-NEXT:    vmovdqa 2560(%rdi), %ymm9
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm9[0],ymm16[0],ymm9[2],ymm16[2]
-; AVX512DQBW-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 2496(%rdi), %zmm10
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 2432(%rdi), %zmm7
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm10, %zmm2, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 2368(%rdi), %zmm3
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 2304(%rdi), %zmm1
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm3, %zmm2, %zmm1
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vmovdqa 2240(%rdi), %ymm0
-; AVX512DQBW-SLOW-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa 2176(%rdi), %ymm3
-; AVX512DQBW-SLOW-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2]
-; AVX512DQBW-SLOW-NEXT:    vmovdqa 2112(%rdi), %ymm3
-; AVX512DQBW-SLOW-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa 2048(%rdi), %ymm8
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm8[0],ymm3[0],ymm8[2],ymm3[2]
-; AVX512DQBW-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 4032(%rdi), %zmm1
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 3968(%rdi), %zmm6
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 3904(%rdi), %zmm1
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 3840(%rdi), %zmm3
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm1, %zmm3, %zmm2
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm3
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 3776(%rdi), %ymm22
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 3712(%rdi), %ymm17
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm15 = ymm17[0],ymm22[0],ymm17[2],ymm22[2]
-; AVX512DQBW-SLOW-NEXT:    vmovdqa 3648(%rdi), %ymm12
-; AVX512DQBW-SLOW-NEXT:    vmovdqa 3584(%rdi), %ymm0
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm14 = ymm0[0],ymm12[0],ymm0[2],ymm12[2]
-; AVX512DQBW-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm14 = ymm14[2,3],ymm15[2,3]
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm14, %zmm2, %zmm2
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,3,11,3,11,3,11]
-; AVX512DQBW-SLOW-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm14
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm4, %zmm2, %zmm14
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm15
-; AVX512DQBW-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm15 # 64-byte Folded Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm14, %zmm15 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm14 = ymm13[1],ymm21[1],ymm13[3],ymm21[3]
-; AVX512DQBW-SLOW-NEXT:    vmovdqu (%rsp), %ymm13 # 32-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
-; AVX512DQBW-SLOW-NEXT:    # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3]
-; AVX512DQBW-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm13 = ymm13[2,3],ymm14[2,3]
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm13, %zmm15, %zmm13
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm24, %zmm2, %zmm13
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm14
-; AVX512DQBW-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm14 # 64-byte Folded Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm14 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
-; AVX512DQBW-SLOW-NEXT:    # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3]
-; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm15 = ymm23[1],ymm25[1],ymm23[3],ymm25[3]
-; AVX512DQBW-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3]
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm13, %zmm14, %zmm13
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm25, %zmm13
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm24, %zmm2, %zmm13
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm23, %zmm2, %zmm5
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm5 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm31, %ymm13 # 32-byte Folded Reload
-; AVX512DQBW-SLOW-NEXT:    # ymm13 = ymm31[1],mem[1],ymm31[3],mem[3]
-; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm15 = ymm19[1],ymm20[1],ymm19[3],ymm20[3]
-; AVX512DQBW-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3]
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm13, %zmm5, %zmm13
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm13
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm20, %zmm2, %zmm13
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm19, %zmm2, %zmm14
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm14 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm13 # 32-byte Folded Reload
-; AVX512DQBW-SLOW-NEXT:    # ymm13 = ymm5[1],mem[1],ymm5[3],mem[3]
-; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm15 = ymm26[1],ymm18[1],ymm26[3],ymm18[3]
-; AVX512DQBW-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3]
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm13, %zmm14, %zmm13
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm13
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm26, %zmm2, %zmm13
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm14 # 64-byte Folded Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm14 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm13 = ymm29[1],ymm28[1],ymm29[3],ymm28[3]
-; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm15 = ymm27[1],ymm30[1],ymm27[3],ymm30[3]
-; AVX512DQBW-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3]
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm13, %zmm14, %zmm13
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm27, %zmm13
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm28, %zmm2, %zmm13
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm30, %zmm14
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm29, %zmm2, %zmm14
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm14 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
-; AVX512DQBW-SLOW-NEXT:    # ymm11 = ymm11[1],mem[1],ymm11[3],mem[3]
-; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm9 = ymm9[1],ymm16[1],ymm9[3],ymm16[3]
-; AVX512DQBW-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm9 = ymm9[2,3],ymm11[2,3]
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm9, %zmm14, %zmm9
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm10, %zmm2, %zmm7
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm10
-; AVX512DQBW-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm10 # 64-byte Folded Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm10 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm7 # 32-byte Folded Reload
-; AVX512DQBW-SLOW-NEXT:    # ymm7 = ymm5[1],mem[1],ymm5[3],mem[3]
-; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm5 # 32-byte Folded Reload
-; AVX512DQBW-SLOW-NEXT:    # ymm5 = ymm8[1],mem[1],ymm8[3],mem[3]
-; AVX512DQBW-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm5 = ymm5[2,3],ymm7[2,3]
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm5, %zmm10, %zmm5
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm14, %zmm2, %zmm6
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm3, %zmm8, %zmm2
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm2 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm17[1],ymm22[1],ymm17[3],ymm22[3]
-; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm12[1],ymm0[3],ymm12[3]
-; AVX512DQBW-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3]
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12]
-; AVX512DQBW-SLOW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm13
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm11
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm4, %zmm0, %zmm1
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm2 = zmm22[0],zmm9[0],zmm22[2],zmm9[2],zmm22[4],zmm9[4],zmm22[6],zmm9[6]
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 3136(%rdi), %zmm4
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 3072(%rdi), %zmm15
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm15, %zmm3
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm4, %zmm0, %zmm3
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 3264(%rdi), %zmm1
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 3200(%rdi), %zmm4
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm6 = [4,12,4,12]
-; AVX512DQBW-SLOW-NEXT:    # ymm6 = mem[0,1,0,1]
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm1, %zmm6, %zmm4
-; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7]
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm2, %zmm2
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm21[0],zmm17[0],zmm21[2],zmm17[2],zmm21[4],zmm17[4],zmm21[6],zmm17[6]
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 576(%rdi), %zmm1
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 512(%rdi), %zmm2
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm1, %zmm0, %zmm2
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 704(%rdi), %zmm21
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 640(%rdi), %zmm4
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm21, %zmm6, %zmm4
-; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm25, %zmm2
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm24, %zmm0, %zmm2
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm12[0],zmm23[0],zmm12[2],zmm23[2],zmm12[4],zmm23[4],zmm12[6],zmm23[6]
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 64(%rdi), %zmm1
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm2
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm1, %zmm0, %zmm2
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 192(%rdi), %zmm1
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 128(%rdi), %zmm4
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm1, %zmm6, %zmm4
-; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm2
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm20, %zmm0, %zmm2
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm1[0],zmm19[0],zmm1[2],zmm19[2],zmm1[4],zmm19[4],zmm1[6],zmm19[6]
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 1600(%rdi), %zmm1
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 1536(%rdi), %zmm2
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm1, %zmm0, %zmm2
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 1728(%rdi), %zmm1
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 1664(%rdi), %zmm4
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm1, %zmm6, %zmm4
-; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm2
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm26, %zmm0, %zmm2
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm1[0],zmm10[0],zmm1[2],zmm10[2],zmm1[4],zmm10[4],zmm1[6],zmm10[6]
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 1088(%rdi), %zmm1
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 1024(%rdi), %zmm2
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm1, %zmm0, %zmm2
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 1216(%rdi), %zmm1
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 1152(%rdi), %zmm4
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm1, %zmm6, %zmm4
-; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm27, %zmm2
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm28, %zmm0, %zmm2
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm30[0],zmm29[0],zmm30[2],zmm29[2],zmm30[4],zmm29[4],zmm30[6],zmm29[6]
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 2624(%rdi), %zmm1
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 2560(%rdi), %zmm2
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm1, %zmm0, %zmm2
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 2752(%rdi), %zmm1
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 2688(%rdi), %zmm4
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm1, %zmm6, %zmm4
-; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 2112(%rdi), %zmm7
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 2048(%rdi), %zmm2
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm7, %zmm0, %zmm2
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 2240(%rdi), %zmm1
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 2176(%rdi), %zmm3
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm1, %zmm6, %zmm3
-; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm26, %zmm3
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm5, %zmm0, %zmm3
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm4 = zmm16[0],zmm30[0],zmm16[2],zmm30[2],zmm16[4],zmm30[4],zmm16[6],zmm30[6]
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm2
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm14, %zmm0, %zmm2
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 3648(%rdi), %zmm3
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 3584(%rdi), %zmm1
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, (%rsp) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm3, %zmm1, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 3776(%rdi), %zmm3
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 3712(%rdi), %zmm1
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm3, %zmm1, %zmm6
-; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7]
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm1 # 64-byte Folded Reload
-; AVX512DQBW-SLOW-NEXT:    # zmm1 = zmm8[0],mem[0],zmm8[2],mem[2],zmm8[4],mem[4],zmm8[6],mem[6]
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm1 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [5,13,5,13,5,13,5,13]
-; AVX512DQBW-SLOW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm25, %zmm0, %zmm15
-; AVX512DQBW-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [5,13,5,13]
-; AVX512DQBW-SLOW-NEXT:    # ymm1 = mem[0,1,0,1]
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm24, %zmm1, %zmm3
-; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3],ymm3[4,5,6,7]
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm11, %zmm0, %zmm13
-; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm4 = zmm22[1],zmm9[1],zmm22[3],zmm9[3],zmm22[5],zmm9[5],zmm22[7],zmm9[7]
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm4 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm20, %zmm0, %zmm2
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm21, %zmm1, %zmm3
-; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm3
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm18, %zmm0, %zmm3
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm19
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm4 = zmm17[1],zmm19[1],zmm17[3],zmm19[3],zmm17[5],zmm19[5],zmm17[7],zmm19[7]
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm14, %zmm2
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm16, %zmm0, %zmm2
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm28, %zmm1, %zmm3
-; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
-; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm4 # 64-byte Folded Reload
-; AVX512DQBW-SLOW-NEXT:    # zmm4 = zmm12[1],mem[1],zmm12[3],mem[3],zmm12[5],mem[5],zmm12[7],mem[7]
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm15, %zmm0, %zmm2
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm13, %zmm1, %zmm3
-; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload
-; AVX512DQBW-SLOW-NEXT:    # zmm4 = zmm4[1],mem[1],zmm4[3],mem[3],zmm4[5],mem[5],zmm4[7],mem[7]
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm11, %zmm0, %zmm2
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm29, %zmm1, %zmm3
-; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm27
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm4 = zmm4[1],zmm10[1],zmm4[3],zmm10[3],zmm4[5],zmm10[5],zmm4[7],zmm10[7]
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm9, %zmm0, %zmm2
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm10, %zmm1, %zmm3
-; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload
-; AVX512DQBW-SLOW-NEXT:    # zmm4 = zmm4[1],mem[1],zmm4[3],mem[3],zmm4[5],mem[5],zmm4[7],mem[7]
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm7, %zmm0, %zmm2
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm3
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm7, %zmm1, %zmm3
-; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm26, %zmm3
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm5, %zmm0, %zmm3
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm4 = zmm5[1],zmm30[1],zmm5[3],zmm30[3],zmm5[5],zmm30[5],zmm5[7],zmm30[7]
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm2
-; AVX512DQBW-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 (%rsp), %zmm6 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm31, %zmm6, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm30, %zmm3, %zmm1
-; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm1 # 64-byte Folded Reload
-; AVX512DQBW-SLOW-NEXT:    # zmm1 = zmm4[1],mem[1],zmm4[3],mem[3],zmm4[5],mem[5],zmm4[7],mem[7]
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm1 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [6,14,6,14,6,14,6,14]
-; AVX512DQBW-SLOW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm25, %zmm0, %zmm2
-; AVX512DQBW-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14]
-; AVX512DQBW-SLOW-NEXT:    # ymm1 = mem[0,1,0,1]
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm24, %zmm1, %zmm3
-; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm22, %zmm3
-; AVX512DQBW-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm22, %zmm3 {%k1} # 64-byte Folded Reload
-; AVX512DQBW-SLOW-NEXT:    # zmm3 {%k1} = zmm22[0],mem[0],zmm22[2],mem[2],zmm22[4],mem[4],zmm22[6],mem[6]
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm20, %zmm0, %zmm2
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm21, %zmm1, %zmm3
-; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm3
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm19, %zmm0, %zmm3
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm23[0],zmm18[0],zmm23[2],zmm18[2],zmm23[4],zmm18[4],zmm23[6],zmm18[6]
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm16, %zmm0, %zmm14
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm28, %zmm1, %zmm3
-; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3],ymm3[4,5,6,7]
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm3
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm18
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm28, %zmm0, %zmm3
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm20[0],zmm14[0],zmm20[2],zmm14[2],zmm20[4],zmm14[4],zmm20[6],zmm14[6]
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm15, %zmm0, %zmm2
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm13, %zmm1, %zmm3
-; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm3
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm15, %zmm0, %zmm3
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm26[0],zmm25[0],zmm26[2],zmm25[2],zmm26[4],zmm25[4],zmm26[6],zmm25[6]
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm11, %zmm0, %zmm2
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm29, %zmm1, %zmm3
-; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm27, %zmm0, %zmm3
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm24[0],zmm29[0],zmm24[2],zmm29[2],zmm24[4],zmm29[4],zmm24[6],zmm29[6]
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm9, %zmm0, %zmm2
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm10, %zmm1, %zmm3
-; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm3
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm10, %zmm0, %zmm3
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm22[0],zmm19[0],zmm22[2],zmm19[2],zmm22[4],zmm19[4],zmm22[6],zmm19[6]
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm7, %zmm1, %zmm8
-; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7]
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm8, %zmm0, %zmm5
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm21[0],zmm17[0],zmm21[2],zmm17[2],zmm21[4],zmm17[4],zmm21[6],zmm17[6]
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm5, %zmm2
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload
-; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm31, %zmm6, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm30, %zmm3, %zmm1
-; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm1[0],zmm23[0],zmm1[2],zmm23[2],zmm1[4],zmm23[4],zmm1[6],zmm23[6]
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm30
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm5
-; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm16 = [7,15,7,15,7,15,7,15]
-; AVX512DQBW-SLOW-NEXT:    # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm2, %zmm16, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm4[1],zmm1[1],zmm4[3],zmm1[3],zmm4[5],zmm1[5],zmm4[7],zmm1[7]
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm6
-; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,8,0,8,0,8,0,8]
-; AVX512DQBW-SLOW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm1, %zmm3, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm13 = [1,9,1,9,1,9,1,9]
-; AVX512DQBW-SLOW-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm1, %zmm13, %zmm6
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm2, %zmm3, %zmm30
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm2, %zmm13, %zmm5
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm1
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm28, %zmm16, %zmm1
-; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm20[1],zmm14[1],zmm20[3],zmm14[3],zmm20[5],zmm14[5],zmm20[7],zmm14[7]
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm1
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm14, %zmm3, %zmm20
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm14, %zmm13, %zmm1
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm28, %zmm3, %zmm18
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm28, %zmm13, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm28
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm15, %zmm16, %zmm12
-; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm12 {%k1} = zmm26[1],zmm25[1],zmm26[3],zmm25[3],zmm26[5],zmm25[5],zmm26[7],zmm25[7]
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm26, %zmm1
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm25, %zmm3, %zmm26
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm25, %zmm13, %zmm1
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm15, %zmm3, %zmm28
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm15, %zmm13, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm25
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm27, %zmm16, %zmm1
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm24, %zmm7
-; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm24[1],zmm29[1],zmm24[3],zmm29[3],zmm24[5],zmm29[5],zmm24[7],zmm29[7]
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm24, %zmm1
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm29, %zmm3, %zmm7
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm29, %zmm13, %zmm1
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm27, %zmm3, %zmm25
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm27, %zmm13, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm27
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm10, %zmm16, %zmm9
-; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm9 {%k1} = zmm22[1],zmm19[1],zmm22[3],zmm19[3],zmm22[5],zmm19[5],zmm22[7],zmm19[7]
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm22, %zmm1
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm19, %zmm3, %zmm22
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm19, %zmm13, %zmm1
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm10, %zmm3, %zmm27
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm10, %zmm13, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm29
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm8, %zmm16, %zmm31
-; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm31 {%k1} = zmm21[1],zmm17[1],zmm21[3],zmm17[3],zmm21[5],zmm17[5],zmm21[7],zmm17[7]
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm1
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm19
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm17, %zmm3, %zmm19
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm17, %zmm13, %zmm1
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm8, %zmm3, %zmm29
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm8, %zmm13, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm8, %zmm3, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm22
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm9, %zmm3, %zmm22
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm24, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm23, %zmm3, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm26, %zmm15, %zmm3
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm8, %zmm13, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm17
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm9, %zmm13, %zmm17
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm24, %zmm1
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm23, %zmm13, %zmm1
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm26, %zmm15, %zmm13
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm26, %zmm16, %zmm15
-; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm24[1],zmm23[1],zmm24[3],zmm23[3],zmm24[5],zmm23[5],zmm24[7],zmm23[7]
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm15, %zmm21
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm15
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm9, %zmm16, %zmm15
-; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm10[1],zmm8[1],zmm10[3],zmm8[3],zmm10[5],zmm8[5],zmm10[7],zmm8[7]
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm15, %zmm23
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm12 # 64-byte Folded Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm5 # 64-byte Folded Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm6 # 64-byte Folded Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm7 # 64-byte Folded Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm8 # 64-byte Folded Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm10 # 64-byte Folded Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm14 # 64-byte Folded Reload
-; AVX512DQBW-SLOW-NEXT:    vbroadcasti64x2 {{.*#+}} ymm16 = [7,15,7,15]
-; AVX512DQBW-SLOW-NEXT:    # ymm16 = mem[0,1,0,1]
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm15 # 64-byte Folded Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm9 # 64-byte Folded Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm1 # 64-byte Folded Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm2 # 64-byte Folded Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm4 # 64-byte Folded Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm11 # 64-byte Folded Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm11 # 64-byte Folded Reload
-; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm15 = ymm12[0,1,2,3],ymm15[4,5,6,7]
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm15, %zmm12, %zmm12
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm15 = ymm5[0,1,2,3],ymm9[4,5,6,7]
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm15, %zmm5, %zmm5
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm15 = ymm6[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm15, %zmm0, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm15 = ymm7[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm15, %zmm0, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm15 = ymm8[0,1,2,3],ymm2[4,5,6,7]
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm15, %zmm0, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm15 = ymm10[0,1,2,3],ymm4[4,5,6,7]
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm15, %zmm31, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload
-; AVX512DQBW-SLOW-NEXT:    # ymm15 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm15, %zmm21, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm15 = ymm14[0,1,2,3],ymm11[4,5,6,7]
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm15, %zmm23, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm18 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vmovdqa 192(%rdi), %xmm0
-; AVX512DQBW-SLOW-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa 128(%rdi), %xmm1
-; AVX512DQBW-SLOW-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX512DQBW-SLOW-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512DQBW-SLOW-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa 64(%rdi), %xmm1
-; AVX512DQBW-SLOW-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm16 = xmm0[0],xmm1[0]
-; AVX512DQBW-SLOW-NEXT:    vinserti32x4 $1, %xmm2, %ymm16, %ymm2
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm18, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm30 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vmovdqa 704(%rdi), %xmm8
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 640(%rdi), %xmm20
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm20[0],xmm8[0]
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 576(%rdi), %xmm18
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 512(%rdi), %xmm16
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm31 = xmm16[0],xmm18[0]
-; AVX512DQBW-SLOW-NEXT:    vinserti32x4 $1, %xmm2, %ymm31, %ymm2
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm30, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm25 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vmovdqa 1216(%rdi), %xmm0
-; AVX512DQBW-SLOW-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa 1152(%rdi), %xmm1
-; AVX512DQBW-SLOW-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm31 = xmm1[0],xmm0[0]
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 1088(%rdi), %xmm23
-; AVX512DQBW-SLOW-NEXT:    vmovdqa 1024(%rdi), %xmm14
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm30 = xmm14[0],xmm23[0]
-; AVX512DQBW-SLOW-NEXT:    vinserti32x4 $1, %xmm31, %ymm30, %ymm30
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm30, %zmm25, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm28 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 1728(%rdi), %xmm30
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 1664(%rdi), %xmm31
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm31[0],xmm30[0]
-; AVX512DQBW-SLOW-NEXT:    vmovdqa 1600(%rdi), %xmm12
-; AVX512DQBW-SLOW-NEXT:    vmovdqa 1536(%rdi), %xmm11
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm25 = xmm11[0],xmm12[0]
-; AVX512DQBW-SLOW-NEXT:    vinserti32x4 $1, %xmm2, %ymm25, %ymm2
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm28, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm19, %zmm29 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 2240(%rdi), %xmm19
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 2176(%rdi), %xmm21
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm25 = xmm21[0],xmm19[0]
-; AVX512DQBW-SLOW-NEXT:    vmovdqa 2112(%rdi), %xmm15
-; AVX512DQBW-SLOW-NEXT:    vmovdqa 2048(%rdi), %xmm1
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm28 = xmm1[0],xmm15[0]
-; AVX512DQBW-SLOW-NEXT:    vinserti32x4 $1, %xmm25, %ymm28, %ymm25
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm25, %zmm29, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm27 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 2752(%rdi), %xmm28
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 2688(%rdi), %xmm29
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm29[0],xmm28[0]
-; AVX512DQBW-SLOW-NEXT:    vmovdqa 2624(%rdi), %xmm10
-; AVX512DQBW-SLOW-NEXT:    vmovdqa 2560(%rdi), %xmm9
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm25 = xmm9[0],xmm10[0]
-; AVX512DQBW-SLOW-NEXT:    vinserti32x4 $1, %xmm2, %ymm25, %ymm2
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm27, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm22 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 3264(%rdi), %xmm25
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 3200(%rdi), %xmm27
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm27[0],xmm25[0]
-; AVX512DQBW-SLOW-NEXT:    vmovdqa 3136(%rdi), %xmm0
-; AVX512DQBW-SLOW-NEXT:    vmovdqa 3072(%rdi), %xmm2
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm26 = xmm2[0],xmm0[0]
-; AVX512DQBW-SLOW-NEXT:    vinserti32x4 $1, %xmm4, %ymm26, %ymm4
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm22, %zmm22
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm3 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vmovdqa 3776(%rdi), %xmm7
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 3712(%rdi), %xmm26
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm26[0],xmm7[0]
-; AVX512DQBW-SLOW-NEXT:    vmovdqa 3648(%rdi), %xmm6
-; AVX512DQBW-SLOW-NEXT:    vmovdqa 3584(%rdi), %xmm5
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm24 = xmm5[0],xmm6[0]
-; AVX512DQBW-SLOW-NEXT:    vinserti32x4 $1, %xmm4, %ymm24, %ymm4
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm3, %zmm24
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm17 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm4 = xmm27[1],xmm25[1]
-; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm0[1]
-; AVX512DQBW-SLOW-NEXT:    vinserti128 $1, %xmm4, %ymm2, %ymm2
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm17, %zmm2
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm0 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm3 = xmm20[1],xmm8[1]
-; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm4 = xmm16[1],xmm18[1]
-; AVX512DQBW-SLOW-NEXT:    vinserti128 $1, %xmm3, %ymm4, %ymm3
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm0, %zmm3
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm0 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
-; AVX512DQBW-SLOW-NEXT:    # xmm4 = xmm4[1],mem[1]
-; AVX512DQBW-SLOW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload
-; AVX512DQBW-SLOW-NEXT:    # xmm8 = xmm8[1],mem[1]
-; AVX512DQBW-SLOW-NEXT:    vinserti128 $1, %xmm4, %ymm8, %ymm4
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm0, %zmm4
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm0 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm8 = xmm31[1],xmm30[1]
-; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm11 = xmm11[1],xmm12[1]
-; AVX512DQBW-SLOW-NEXT:    vinserti128 $1, %xmm8, %ymm11, %ymm8
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm8, %zmm0, %zmm8
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm0 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload
-; AVX512DQBW-SLOW-NEXT:    # xmm11 = xmm11[1],mem[1]
-; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm12 = xmm14[1],xmm23[1]
-; AVX512DQBW-SLOW-NEXT:    vinserti128 $1, %xmm11, %ymm12, %ymm11
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm11, %zmm0, %zmm11
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm0 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm12 = xmm29[1],xmm28[1]
-; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm9 = xmm9[1],xmm10[1]
-; AVX512DQBW-SLOW-NEXT:    vinserti128 $1, %xmm12, %ymm9, %ymm9
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm9, %zmm0, %zmm9
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm0 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm10 = xmm21[1],xmm19[1]
-; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm12 = xmm1[1],xmm15[1]
-; AVX512DQBW-SLOW-NEXT:    vinserti128 $1, %xmm10, %ymm12, %ymm10
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm10, %zmm0, %zmm10
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm13 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm7 = xmm26[1],xmm7[1]
-; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm6[1]
-; AVX512DQBW-SLOW-NEXT:    vinserti128 $1, %xmm7, %ymm5, %ymm5
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm5, %zmm13, %zmm1
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm24, 448(%rsi)
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm22, 384(%rsi)
-; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 320(%rsi)
-; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 256(%rsi)
-; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 192(%rsi)
-; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 128(%rsi)
-; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 64(%rsi)
-; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, (%rsi)
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, 448(%rdx)
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm10, 256(%rdx)
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm9, 320(%rdx)
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm11, 128(%rdx)
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm8, 192(%rdx)
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm4, (%rdx)
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm3, 64(%rdx)
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm2, 384(%rdx)
-; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 448(%rcx)
-; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 256(%rcx)
-; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 320(%rcx)
-; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 128(%rcx)
-; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 192(%rcx)
-; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, (%rcx)
-; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 64(%rcx)
-; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 384(%rcx)
-; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 448(%r8)
-; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 256(%r8)
-; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 320(%r8)
-; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 128(%r8)
-; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 192(%r8)
-; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, (%r8)
-; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 64(%r8)
-; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 384(%r8)
-; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 448(%r9)
-; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 256(%r9)
-; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 320(%r9)
-; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 128(%r9)
-; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 192(%r9)
-; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, (%r9)
-; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 64(%r9)
-; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 384(%r9)
-; AVX512DQBW-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 448(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 256(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 320(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 128(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 192(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, (%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 64(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 384(%rax)
-; AVX512DQBW-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 448(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 256(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 320(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 128(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 192(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, (%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 64(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 384(%rax)
-; AVX512DQBW-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQBW-SLOW-NEXT:    vmovups (%rsp), %zmm0 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 384(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 448(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 256(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 320(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 128(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 192(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, (%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 64(%rax)
-; AVX512DQBW-SLOW-NEXT:    addq $6728, %rsp # imm = 0x1A48
-; AVX512DQBW-SLOW-NEXT:    vzeroupper
-; AVX512DQBW-SLOW-NEXT:    retq
-;
-; AVX512DQBW-FAST-LABEL: load_i64_stride8_vf64:
-; AVX512DQBW-FAST:       # %bb.0:
-; AVX512DQBW-FAST-NEXT:    subq $6728, %rsp # imm = 0x1A48
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 3392(%rdi), %zmm3
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 3328(%rdi), %zmm13
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 3520(%rdi), %zmm1
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 3456(%rdi), %zmm0
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 1856(%rdi), %zmm11
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 1984(%rdi), %zmm15
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 832(%rdi), %zmm6
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 768(%rdi), %zmm7
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 960(%rdi), %zmm24
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 896(%rdi), %zmm10
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 320(%rdi), %zmm12
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 256(%rdi), %zmm5
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 448(%rdi), %zmm8
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 384(%rdi), %zmm14
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    movb $-64, %al
-; AVX512DQBW-FAST-NEXT:    kmovd %eax, %k1
-; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10]
-; AVX512DQBW-FAST-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm13, %zmm1
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm3, %zmm2, %zmm1
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 3264(%rdi), %ymm21
-; AVX512DQBW-FAST-NEXT:    vmovdqa 3200(%rdi), %ymm0
-; AVX512DQBW-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm21[0],ymm0[2],ymm21[2]
-; AVX512DQBW-FAST-NEXT:    vmovdqa 3136(%rdi), %ymm4
-; AVX512DQBW-FAST-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa 3072(%rdi), %ymm3
-; AVX512DQBW-FAST-NEXT:    vmovdqu %ymm3, (%rsp) # 32-byte Spill
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2]
-; AVX512DQBW-FAST-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm10, %zmm0
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm24, %zmm2, %zmm0
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm7, %zmm1
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm6, %zmm2, %zmm1
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512DQBW-FAST-NEXT:    vmovdqa 704(%rdi), %ymm3
-; AVX512DQBW-FAST-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa 640(%rdi), %ymm0
-; AVX512DQBW-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 576(%rdi), %ymm25
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 512(%rdi), %ymm23
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm23[0],ymm25[0],ymm23[2],ymm25[2]
-; AVX512DQBW-FAST-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm14, %zmm0
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm8, %zmm2, %zmm0
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm5, %zmm1
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm12, %zmm2, %zmm1
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512DQBW-FAST-NEXT:    vmovdqa 192(%rdi), %ymm0
-; AVX512DQBW-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 128(%rdi), %ymm31
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm31[0],ymm0[0],ymm31[2],ymm0[2]
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%rdi), %ymm20
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%rdi), %ymm19
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm19[0],ymm20[0],ymm19[2],ymm20[2]
-; AVX512DQBW-FAST-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 1920(%rdi), %zmm3
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, %zmm0
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm15, %zmm2, %zmm0
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 1792(%rdi), %zmm1
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm11, %zmm2, %zmm1
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512DQBW-FAST-NEXT:    vmovdqa 1728(%rdi), %ymm3
-; AVX512DQBW-FAST-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa 1664(%rdi), %ymm0
-; AVX512DQBW-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 1600(%rdi), %ymm18
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 1536(%rdi), %ymm26
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm26[0],ymm18[0],ymm26[2],ymm18[2]
-; AVX512DQBW-FAST-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 1472(%rdi), %zmm1
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 1408(%rdi), %zmm0
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 1344(%rdi), %zmm3
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 1280(%rdi), %zmm1
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm3, %zmm2, %zmm1
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 1216(%rdi), %ymm28
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 1152(%rdi), %ymm29
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm29[0],ymm28[0],ymm29[2],ymm28[2]
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 1088(%rdi), %ymm30
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 1024(%rdi), %ymm27
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm27[0],ymm30[0],ymm27[2],ymm30[2]
-; AVX512DQBW-FAST-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 3008(%rdi), %zmm1
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 2944(%rdi), %zmm0
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 2880(%rdi), %zmm3
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 2816(%rdi), %zmm1
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm3, %zmm2, %zmm1
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512DQBW-FAST-NEXT:    vmovdqa 2752(%rdi), %ymm0
-; AVX512DQBW-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa 2688(%rdi), %ymm11
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm11[0],ymm0[0],ymm11[2],ymm0[2]
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 2624(%rdi), %ymm16
-; AVX512DQBW-FAST-NEXT:    vmovdqa 2560(%rdi), %ymm9
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm9[0],ymm16[0],ymm9[2],ymm16[2]
-; AVX512DQBW-FAST-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 2496(%rdi), %zmm10
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 2432(%rdi), %zmm7
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm7, %zmm0
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm10, %zmm2, %zmm0
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 2368(%rdi), %zmm3
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 2304(%rdi), %zmm1
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm3, %zmm2, %zmm1
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512DQBW-FAST-NEXT:    vmovdqa 2240(%rdi), %ymm0
-; AVX512DQBW-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa 2176(%rdi), %ymm3
-; AVX512DQBW-FAST-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2]
-; AVX512DQBW-FAST-NEXT:    vmovdqa 2112(%rdi), %ymm3
-; AVX512DQBW-FAST-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa 2048(%rdi), %ymm8
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm8[0],ymm3[0],ymm8[2],ymm3[2]
-; AVX512DQBW-FAST-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 4032(%rdi), %zmm1
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 3968(%rdi), %zmm6
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm6, %zmm0
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 3904(%rdi), %zmm1
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 3840(%rdi), %zmm3
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm1, %zmm3, %zmm2
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, %zmm3
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k1}
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 3776(%rdi), %ymm22
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 3712(%rdi), %ymm17
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm15 = ymm17[0],ymm22[0],ymm17[2],ymm22[2]
-; AVX512DQBW-FAST-NEXT:    vmovdqa 3648(%rdi), %ymm12
-; AVX512DQBW-FAST-NEXT:    vmovdqa 3584(%rdi), %ymm0
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm14 = ymm0[0],ymm12[0],ymm0[2],ymm12[2]
-; AVX512DQBW-FAST-NEXT:    vperm2i128 {{.*#+}} ymm14 = ymm14[2,3],ymm15[2,3]
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm14, %zmm2, %zmm2
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,3,11,3,11,3,11]
-; AVX512DQBW-FAST-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, %zmm14
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm4, %zmm2, %zmm14
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm13, %zmm15
-; AVX512DQBW-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm15 # 64-byte Folded Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm14, %zmm15 {%k1}
-; AVX512DQBW-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm14 = ymm13[1],ymm21[1],ymm13[3],ymm21[3]
-; AVX512DQBW-FAST-NEXT:    vmovdqu (%rsp), %ymm13 # 32-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
-; AVX512DQBW-FAST-NEXT:    # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3]
-; AVX512DQBW-FAST-NEXT:    vperm2i128 {{.*#+}} ymm13 = ymm13[2,3],ymm14[2,3]
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm13, %zmm15, %zmm13
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm24, %zmm2, %zmm13
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm21, %zmm14
-; AVX512DQBW-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm14 # 64-byte Folded Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm13, %zmm14 {%k1}
-; AVX512DQBW-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
-; AVX512DQBW-FAST-NEXT:    # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3]
-; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm15 = ymm23[1],ymm25[1],ymm23[3],ymm25[3]
-; AVX512DQBW-FAST-NEXT:    vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3]
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm13, %zmm14, %zmm13
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm25, %zmm13
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm24, %zmm2, %zmm13
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm23, %zmm2, %zmm5
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm13, %zmm5 {%k1}
-; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm31, %ymm13 # 32-byte Folded Reload
-; AVX512DQBW-FAST-NEXT:    # ymm13 = ymm31[1],mem[1],ymm31[3],mem[3]
-; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm15 = ymm19[1],ymm20[1],ymm19[3],ymm20[3]
-; AVX512DQBW-FAST-NEXT:    vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3]
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm13, %zmm5, %zmm13
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm31, %zmm13
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm20, %zmm2, %zmm13
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm19, %zmm2, %zmm14
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm13, %zmm14 {%k1}
-; AVX512DQBW-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm13 # 32-byte Folded Reload
-; AVX512DQBW-FAST-NEXT:    # ymm13 = ymm5[1],mem[1],ymm5[3],mem[3]
-; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm15 = ymm26[1],ymm18[1],ymm26[3],ymm18[3]
-; AVX512DQBW-FAST-NEXT:    vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3]
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm13, %zmm14, %zmm13
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm18, %zmm13
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm26, %zmm2, %zmm13
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm14 # 64-byte Folded Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm13, %zmm14 {%k1}
-; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm13 = ymm29[1],ymm28[1],ymm29[3],ymm28[3]
-; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm15 = ymm27[1],ymm30[1],ymm27[3],ymm30[3]
-; AVX512DQBW-FAST-NEXT:    vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3]
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm13, %zmm14, %zmm13
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm27, %zmm13
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm28, %zmm2, %zmm13
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm30, %zmm14
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm29, %zmm2, %zmm14
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm13, %zmm14 {%k1}
-; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
-; AVX512DQBW-FAST-NEXT:    # ymm11 = ymm11[1],mem[1],ymm11[3],mem[3]
-; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm9 = ymm9[1],ymm16[1],ymm9[3],ymm16[3]
-; AVX512DQBW-FAST-NEXT:    vperm2i128 {{.*#+}} ymm9 = ymm9[2,3],ymm11[2,3]
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm9, %zmm14, %zmm9
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm10, %zmm2, %zmm7
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm16, %zmm10
-; AVX512DQBW-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm10 # 64-byte Folded Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm7, %zmm10 {%k1}
-; AVX512DQBW-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm7 # 32-byte Folded Reload
-; AVX512DQBW-FAST-NEXT:    # ymm7 = ymm5[1],mem[1],ymm5[3],mem[3]
-; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm5 # 32-byte Folded Reload
-; AVX512DQBW-FAST-NEXT:    # ymm5 = ymm8[1],mem[1],ymm8[3],mem[3]
-; AVX512DQBW-FAST-NEXT:    vperm2i128 {{.*#+}} ymm5 = ymm5[2,3],ymm7[2,3]
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm5, %zmm10, %zmm5
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm14, %zmm2, %zmm6
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm3, %zmm8, %zmm2
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm6, %zmm2 {%k1}
-; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm17[1],ymm22[1],ymm17[3],ymm22[3]
-; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm12[1],ymm0[3],ymm12[3]
-; AVX512DQBW-FAST-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3]
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm0
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12]
-; AVX512DQBW-FAST-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, %zmm13
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm4, %zmm11
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm4, %zmm0, %zmm1
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm2 = zmm22[0],zmm9[0],zmm22[2],zmm9[2],zmm22[4],zmm9[4],zmm22[6],zmm9[6]
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 3136(%rdi), %zmm4
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 3072(%rdi), %zmm15
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm15, %zmm3
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm4, %zmm0, %zmm3
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 3264(%rdi), %zmm1
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 3200(%rdi), %zmm4
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm6 = [4,12,4,12]
-; AVX512DQBW-FAST-NEXT:    # ymm6 = mem[0,1,0,1]
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm1, %zmm6, %zmm4
-; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7]
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm2, %zmm2
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm21[0],zmm17[0],zmm21[2],zmm17[2],zmm21[4],zmm17[4],zmm21[6],zmm17[6]
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 576(%rdi), %zmm1
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 512(%rdi), %zmm2
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm1, %zmm0, %zmm2
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 704(%rdi), %zmm21
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 640(%rdi), %zmm4
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm21, %zmm6, %zmm4
-; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm25, %zmm2
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm24, %zmm0, %zmm2
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm12[0],zmm23[0],zmm12[2],zmm23[2],zmm12[4],zmm23[4],zmm12[6],zmm23[6]
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%rdi), %zmm1
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%rdi), %zmm2
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm1, %zmm0, %zmm2
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 192(%rdi), %zmm1
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 128(%rdi), %zmm4
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm1, %zmm6, %zmm4
-; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm31, %zmm2
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm20, %zmm0, %zmm2
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm1[0],zmm19[0],zmm1[2],zmm19[2],zmm1[4],zmm19[4],zmm1[6],zmm19[6]
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 1600(%rdi), %zmm1
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 1536(%rdi), %zmm2
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm1, %zmm0, %zmm2
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 1728(%rdi), %zmm1
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 1664(%rdi), %zmm4
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm1, %zmm6, %zmm4
-; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm18, %zmm2
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm26, %zmm0, %zmm2
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm1[0],zmm10[0],zmm1[2],zmm10[2],zmm1[4],zmm10[4],zmm1[6],zmm10[6]
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 1088(%rdi), %zmm1
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 1024(%rdi), %zmm2
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm1, %zmm0, %zmm2
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 1216(%rdi), %zmm1
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 1152(%rdi), %zmm4
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm1, %zmm6, %zmm4
-; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm27, %zmm2
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm28, %zmm0, %zmm2
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm30[0],zmm29[0],zmm30[2],zmm29[2],zmm30[4],zmm29[4],zmm30[6],zmm29[6]
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 2624(%rdi), %zmm1
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 2560(%rdi), %zmm2
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm1, %zmm0, %zmm2
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 2752(%rdi), %zmm1
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 2688(%rdi), %zmm4
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm1, %zmm6, %zmm4
-; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 2112(%rdi), %zmm7
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 2048(%rdi), %zmm2
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm7, %zmm0, %zmm2
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 2240(%rdi), %zmm1
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 2176(%rdi), %zmm3
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm1, %zmm6, %zmm3
-; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm26, %zmm3
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm5, %zmm0, %zmm3
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm4 = zmm16[0],zmm30[0],zmm16[2],zmm30[2],zmm16[4],zmm30[4],zmm16[6],zmm30[6]
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm31, %zmm2
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm14, %zmm0, %zmm2
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 3648(%rdi), %zmm3
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 3584(%rdi), %zmm1
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, (%rsp) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm3, %zmm1, %zmm0
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 3776(%rdi), %zmm3
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 3712(%rdi), %zmm1
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm3, %zmm1, %zmm6
-; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7]
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm1 # 64-byte Folded Reload
-; AVX512DQBW-FAST-NEXT:    # zmm1 = zmm8[0],mem[0],zmm8[2],mem[2],zmm8[4],mem[4],zmm8[6],mem[6]
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm1 {%k1}
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [5,13,5,13,5,13,5,13]
-; AVX512DQBW-FAST-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm25, %zmm0, %zmm15
-; AVX512DQBW-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [5,13,5,13]
-; AVX512DQBW-FAST-NEXT:    # ymm1 = mem[0,1,0,1]
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm24, %zmm1, %zmm3
-; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3],ymm3[4,5,6,7]
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm11, %zmm0, %zmm13
-; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm4 = zmm22[1],zmm9[1],zmm22[3],zmm9[3],zmm22[5],zmm9[5],zmm22[7],zmm9[7]
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm13, %zmm4 {%k1}
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm20, %zmm0, %zmm2
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm21, %zmm1, %zmm3
-; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm23, %zmm3
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm18, %zmm0, %zmm3
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm17, %zmm19
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm4 = zmm17[1],zmm19[1],zmm17[3],zmm19[3],zmm17[5],zmm19[5],zmm17[7],zmm19[7]
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm14, %zmm2
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm16, %zmm0, %zmm2
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm28, %zmm1, %zmm3
-; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
-; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm4 # 64-byte Folded Reload
-; AVX512DQBW-FAST-NEXT:    # zmm4 = zmm12[1],mem[1],zmm12[3],mem[3],zmm12[5],mem[5],zmm12[7],mem[7]
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm15, %zmm0, %zmm2
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm13, %zmm1, %zmm3
-; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload
-; AVX512DQBW-FAST-NEXT:    # zmm4 = zmm4[1],mem[1],zmm4[3],mem[3],zmm4[5],mem[5],zmm4[7],mem[7]
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm11, %zmm0, %zmm2
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm29, %zmm1, %zmm3
-; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm10, %zmm27
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm4 = zmm4[1],zmm10[1],zmm4[3],zmm10[3],zmm4[5],zmm10[5],zmm4[7],zmm10[7]
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm9, %zmm0, %zmm2
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm10, %zmm1, %zmm3
-; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload
-; AVX512DQBW-FAST-NEXT:    # zmm4 = zmm4[1],mem[1],zmm4[3],mem[3],zmm4[5],mem[5],zmm4[7],mem[7]
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm7, %zmm0, %zmm2
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm8, %zmm3
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm7, %zmm1, %zmm3
-; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm26, %zmm3
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm5, %zmm0, %zmm3
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm4 = zmm5[1],zmm30[1],zmm5[3],zmm30[3],zmm5[5],zmm30[5],zmm5[7],zmm30[7]
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm31, %zmm2
-; AVX512DQBW-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 (%rsp), %zmm6 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm31, %zmm6, %zmm0
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm30, %zmm3, %zmm1
-; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm1 # 64-byte Folded Reload
-; AVX512DQBW-FAST-NEXT:    # zmm1 = zmm4[1],mem[1],zmm4[3],mem[3],zmm4[5],mem[5],zmm4[7],mem[7]
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm1 {%k1}
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [6,14,6,14,6,14,6,14]
-; AVX512DQBW-FAST-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm25, %zmm0, %zmm2
-; AVX512DQBW-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14]
-; AVX512DQBW-FAST-NEXT:    # ymm1 = mem[0,1,0,1]
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm24, %zmm1, %zmm3
-; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm22, %zmm3
-; AVX512DQBW-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm22, %zmm3 {%k1} # 64-byte Folded Reload
-; AVX512DQBW-FAST-NEXT:    # zmm3 {%k1} = zmm22[0],mem[0],zmm22[2],mem[2],zmm22[4],mem[4],zmm22[6],mem[6]
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm20, %zmm0, %zmm2
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm21, %zmm1, %zmm3
-; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm17, %zmm3
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm19, %zmm0, %zmm3
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm23[0],zmm18[0],zmm23[2],zmm18[2],zmm23[4],zmm18[4],zmm23[6],zmm18[6]
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm16, %zmm0, %zmm14
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm28, %zmm1, %zmm3
-; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3],ymm3[4,5,6,7]
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm12, %zmm3
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm12, %zmm18
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm28, %zmm0, %zmm3
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm20[0],zmm14[0],zmm20[2],zmm14[2],zmm20[4],zmm14[4],zmm20[6],zmm14[6]
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm15, %zmm0, %zmm2
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm13, %zmm1, %zmm3
-; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm12, %zmm3
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm15, %zmm0, %zmm3
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm26[0],zmm25[0],zmm26[2],zmm25[2],zmm26[4],zmm25[4],zmm26[6],zmm25[6]
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm11, %zmm0, %zmm2
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm29, %zmm1, %zmm3
-; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm27, %zmm0, %zmm3
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm24[0],zmm29[0],zmm24[2],zmm29[2],zmm24[4],zmm29[4],zmm24[6],zmm29[6]
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm9, %zmm0, %zmm2
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm10, %zmm1, %zmm3
-; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm9, %zmm3
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm10, %zmm0, %zmm3
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm22[0],zmm19[0],zmm22[2],zmm19[2],zmm22[4],zmm19[4],zmm22[6],zmm19[6]
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm7, %zmm1, %zmm8
-; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7]
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm8, %zmm0, %zmm5
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm21[0],zmm17[0],zmm21[2],zmm17[2],zmm21[4],zmm17[4],zmm21[6],zmm17[6]
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm5, %zmm2
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload
-; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm31, %zmm6, %zmm0
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm30, %zmm3, %zmm1
-; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm1[0],zmm23[0],zmm1[2],zmm23[2],zmm1[4],zmm23[4],zmm1[6],zmm23[6]
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm0
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm30
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm5
-; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm16 = [7,15,7,15,7,15,7,15]
-; AVX512DQBW-FAST-NEXT:    # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm2, %zmm16, %zmm0
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm4[1],zmm1[1],zmm4[3],zmm1[3],zmm4[5],zmm1[5],zmm4[7],zmm1[7]
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm4, %zmm0
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm4, %zmm6
-; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,8,0,8,0,8,0,8]
-; AVX512DQBW-FAST-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm1, %zmm3, %zmm0
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm13 = [1,9,1,9,1,9,1,9]
-; AVX512DQBW-FAST-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm1, %zmm13, %zmm6
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm2, %zmm3, %zmm30
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm2, %zmm13, %zmm5
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm18, %zmm1
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm18, %zmm0
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm28, %zmm16, %zmm1
-; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm20[1],zmm14[1],zmm20[3],zmm14[3],zmm20[5],zmm14[5],zmm20[7],zmm14[7]
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm20, %zmm1
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm14, %zmm3, %zmm20
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm14, %zmm13, %zmm1
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm28, %zmm3, %zmm18
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm28, %zmm13, %zmm0
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm12, %zmm28
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm12, %zmm0
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm15, %zmm16, %zmm12
-; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm12 {%k1} = zmm26[1],zmm25[1],zmm26[3],zmm25[3],zmm26[5],zmm25[5],zmm26[7],zmm25[7]
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm26, %zmm1
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm25, %zmm3, %zmm26
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm25, %zmm13, %zmm1
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm15, %zmm3, %zmm28
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm15, %zmm13, %zmm0
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, %zmm25
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, %zmm0
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm27, %zmm16, %zmm1
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm24, %zmm7
-; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm24[1],zmm29[1],zmm24[3],zmm29[3],zmm24[5],zmm29[5],zmm24[7],zmm29[7]
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm24, %zmm1
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm29, %zmm3, %zmm7
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm29, %zmm13, %zmm1
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm27, %zmm3, %zmm25
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm27, %zmm13, %zmm0
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm9, %zmm27
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm9, %zmm0
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm10, %zmm16, %zmm9
-; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm9 {%k1} = zmm22[1],zmm19[1],zmm22[3],zmm19[3],zmm22[5],zmm19[5],zmm22[7],zmm19[7]
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm22, %zmm1
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm19, %zmm3, %zmm22
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm19, %zmm13, %zmm1
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm10, %zmm3, %zmm27
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm10, %zmm13, %zmm0
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm31, %zmm29
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm31, %zmm0
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm8, %zmm16, %zmm31
-; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm31 {%k1} = zmm21[1],zmm17[1],zmm21[3],zmm17[3],zmm21[5],zmm17[5],zmm21[7],zmm17[7]
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm21, %zmm1
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm21, %zmm19
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm17, %zmm3, %zmm19
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm17, %zmm13, %zmm1
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm8, %zmm3, %zmm29
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm8, %zmm13, %zmm0
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm10, %zmm0
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm8, %zmm3, %zmm0
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm21, %zmm22
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm9, %zmm3, %zmm22
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm24, %zmm0
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm23, %zmm3, %zmm0
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm26, %zmm15, %zmm3
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm10, %zmm0
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm8, %zmm13, %zmm0
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm21, %zmm17
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm21, %zmm0
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm9, %zmm13, %zmm17
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm24, %zmm1
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm23, %zmm13, %zmm1
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm26, %zmm15, %zmm13
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm26, %zmm16, %zmm15
-; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm24[1],zmm23[1],zmm24[3],zmm23[3],zmm24[5],zmm23[5],zmm24[7],zmm23[7]
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm15, %zmm21
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm15
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm9, %zmm16, %zmm15
-; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm10[1],zmm8[1],zmm10[3],zmm8[3],zmm10[5],zmm8[5],zmm10[7],zmm8[7]
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm15, %zmm23
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm12 # 64-byte Folded Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm5 # 64-byte Folded Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm6 # 64-byte Folded Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm7 # 64-byte Folded Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm8 # 64-byte Folded Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm10 # 64-byte Folded Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm14 # 64-byte Folded Reload
-; AVX512DQBW-FAST-NEXT:    vbroadcasti64x2 {{.*#+}} ymm16 = [7,15,7,15]
-; AVX512DQBW-FAST-NEXT:    # ymm16 = mem[0,1,0,1]
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm15 # 64-byte Folded Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm9 # 64-byte Folded Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm1 # 64-byte Folded Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm2 # 64-byte Folded Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm4 # 64-byte Folded Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm11 # 64-byte Folded Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm11 # 64-byte Folded Reload
-; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm15 = ymm12[0,1,2,3],ymm15[4,5,6,7]
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm15, %zmm12, %zmm12
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm15 = ymm5[0,1,2,3],ymm9[4,5,6,7]
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm15, %zmm5, %zmm5
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm15 = ymm6[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm15, %zmm0, %zmm0
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm15 = ymm7[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm15, %zmm0, %zmm0
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm15 = ymm8[0,1,2,3],ymm2[4,5,6,7]
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm15, %zmm0, %zmm0
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm15 = ymm10[0,1,2,3],ymm4[4,5,6,7]
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm15, %zmm31, %zmm0
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload
-; AVX512DQBW-FAST-NEXT:    # ymm15 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm15, %zmm21, %zmm0
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm15 = ymm14[0,1,2,3],ymm11[4,5,6,7]
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm15, %zmm23, %zmm0
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm20, %zmm18 {%k1}
-; AVX512DQBW-FAST-NEXT:    vmovdqa 192(%rdi), %xmm0
-; AVX512DQBW-FAST-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa 128(%rdi), %xmm1
-; AVX512DQBW-FAST-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX512DQBW-FAST-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512DQBW-FAST-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa 64(%rdi), %xmm1
-; AVX512DQBW-FAST-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm16 = xmm0[0],xmm1[0]
-; AVX512DQBW-FAST-NEXT:    vinserti32x4 $1, %xmm2, %ymm16, %ymm2
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm18, %zmm0
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm30 {%k1}
-; AVX512DQBW-FAST-NEXT:    vmovdqa 704(%rdi), %xmm8
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 640(%rdi), %xmm20
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm20[0],xmm8[0]
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 576(%rdi), %xmm18
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 512(%rdi), %xmm16
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm31 = xmm16[0],xmm18[0]
-; AVX512DQBW-FAST-NEXT:    vinserti32x4 $1, %xmm2, %ymm31, %ymm2
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm30, %zmm0
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm25 {%k1}
-; AVX512DQBW-FAST-NEXT:    vmovdqa 1216(%rdi), %xmm0
-; AVX512DQBW-FAST-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa 1152(%rdi), %xmm1
-; AVX512DQBW-FAST-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm31 = xmm1[0],xmm0[0]
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 1088(%rdi), %xmm23
-; AVX512DQBW-FAST-NEXT:    vmovdqa 1024(%rdi), %xmm14
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm30 = xmm14[0],xmm23[0]
-; AVX512DQBW-FAST-NEXT:    vinserti32x4 $1, %xmm31, %ymm30, %ymm30
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm30, %zmm25, %zmm0
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm28 {%k1}
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 1728(%rdi), %xmm30
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 1664(%rdi), %xmm31
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm31[0],xmm30[0]
-; AVX512DQBW-FAST-NEXT:    vmovdqa 1600(%rdi), %xmm12
-; AVX512DQBW-FAST-NEXT:    vmovdqa 1536(%rdi), %xmm11
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm25 = xmm11[0],xmm12[0]
-; AVX512DQBW-FAST-NEXT:    vinserti32x4 $1, %xmm2, %ymm25, %ymm2
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm28, %zmm0
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm19, %zmm29 {%k1}
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 2240(%rdi), %xmm19
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 2176(%rdi), %xmm21
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm25 = xmm21[0],xmm19[0]
-; AVX512DQBW-FAST-NEXT:    vmovdqa 2112(%rdi), %xmm15
-; AVX512DQBW-FAST-NEXT:    vmovdqa 2048(%rdi), %xmm1
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm28 = xmm1[0],xmm15[0]
-; AVX512DQBW-FAST-NEXT:    vinserti32x4 $1, %xmm25, %ymm28, %ymm25
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm25, %zmm29, %zmm0
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm27 {%k1}
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 2752(%rdi), %xmm28
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 2688(%rdi), %xmm29
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm29[0],xmm28[0]
-; AVX512DQBW-FAST-NEXT:    vmovdqa 2624(%rdi), %xmm10
-; AVX512DQBW-FAST-NEXT:    vmovdqa 2560(%rdi), %xmm9
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm25 = xmm9[0],xmm10[0]
-; AVX512DQBW-FAST-NEXT:    vinserti32x4 $1, %xmm2, %ymm25, %ymm2
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm27, %zmm0
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm22 {%k1}
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 3264(%rdi), %xmm25
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 3200(%rdi), %xmm27
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm27[0],xmm25[0]
-; AVX512DQBW-FAST-NEXT:    vmovdqa 3136(%rdi), %xmm0
-; AVX512DQBW-FAST-NEXT:    vmovdqa 3072(%rdi), %xmm2
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm26 = xmm2[0],xmm0[0]
-; AVX512DQBW-FAST-NEXT:    vinserti32x4 $1, %xmm4, %ymm26, %ymm4
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm22, %zmm22
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm4, %zmm3 {%k1}
-; AVX512DQBW-FAST-NEXT:    vmovdqa 3776(%rdi), %xmm7
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 3712(%rdi), %xmm26
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm26[0],xmm7[0]
-; AVX512DQBW-FAST-NEXT:    vmovdqa 3648(%rdi), %xmm6
-; AVX512DQBW-FAST-NEXT:    vmovdqa 3584(%rdi), %xmm5
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm24 = xmm5[0],xmm6[0]
-; AVX512DQBW-FAST-NEXT:    vinserti32x4 $1, %xmm4, %ymm24, %ymm4
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm3, %zmm24
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, %zmm17 {%k1}
-; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm4 = xmm27[1],xmm25[1]
-; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm0[1]
-; AVX512DQBW-FAST-NEXT:    vinserti128 $1, %xmm4, %ymm2, %ymm2
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm17, %zmm2
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, %zmm0 {%k1}
-; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm3 = xmm20[1],xmm8[1]
-; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm4 = xmm16[1],xmm18[1]
-; AVX512DQBW-FAST-NEXT:    vinserti128 $1, %xmm3, %ymm4, %ymm3
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm0, %zmm3
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm4, %zmm0 {%k1}
-; AVX512DQBW-FAST-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
-; AVX512DQBW-FAST-NEXT:    # xmm4 = xmm4[1],mem[1]
-; AVX512DQBW-FAST-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload
-; AVX512DQBW-FAST-NEXT:    # xmm8 = xmm8[1],mem[1]
-; AVX512DQBW-FAST-NEXT:    vinserti128 $1, %xmm4, %ymm8, %ymm4
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm0, %zmm4
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm8, %zmm0 {%k1}
-; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm8 = xmm31[1],xmm30[1]
-; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm11 = xmm11[1],xmm12[1]
-; AVX512DQBW-FAST-NEXT:    vinserti128 $1, %xmm8, %ymm11, %ymm8
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm8, %zmm0, %zmm8
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm11, %zmm0 {%k1}
-; AVX512DQBW-FAST-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload
-; AVX512DQBW-FAST-NEXT:    # xmm11 = xmm11[1],mem[1]
-; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm12 = xmm14[1],xmm23[1]
-; AVX512DQBW-FAST-NEXT:    vinserti128 $1, %xmm11, %ymm12, %ymm11
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm11, %zmm0, %zmm11
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm12, %zmm0 {%k1}
-; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm12 = xmm29[1],xmm28[1]
-; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm9 = xmm9[1],xmm10[1]
-; AVX512DQBW-FAST-NEXT:    vinserti128 $1, %xmm12, %ymm9, %ymm9
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm9, %zmm0, %zmm9
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm10, %zmm0 {%k1}
-; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm10 = xmm21[1],xmm19[1]
-; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm12 = xmm1[1],xmm15[1]
-; AVX512DQBW-FAST-NEXT:    vinserti128 $1, %xmm10, %ymm12, %ymm10
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm10, %zmm0, %zmm10
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm13 {%k1}
-; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm7 = xmm26[1],xmm7[1]
-; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm6[1]
-; AVX512DQBW-FAST-NEXT:    vinserti128 $1, %xmm7, %ymm5, %ymm5
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm5, %zmm13, %zmm1
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm24, 448(%rsi)
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm22, 384(%rsi)
-; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 320(%rsi)
-; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 256(%rsi)
-; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 192(%rsi)
-; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 128(%rsi)
-; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 64(%rsi)
-; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, (%rsi)
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, 448(%rdx)
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm10, 256(%rdx)
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm9, 320(%rdx)
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm11, 128(%rdx)
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm8, 192(%rdx)
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm4, (%rdx)
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, 64(%rdx)
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, 384(%rdx)
-; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 448(%rcx)
-; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 256(%rcx)
-; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 320(%rcx)
-; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 128(%rcx)
-; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 192(%rcx)
-; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, (%rcx)
-; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 64(%rcx)
-; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 384(%rcx)
-; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 448(%r8)
-; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 256(%r8)
-; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 320(%r8)
-; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 128(%r8)
-; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 192(%r8)
-; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, (%r8)
-; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 64(%r8)
-; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 384(%r8)
-; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 448(%r9)
-; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 256(%r9)
-; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 320(%r9)
-; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 128(%r9)
-; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 192(%r9)
-; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, (%r9)
-; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 64(%r9)
-; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 384(%r9)
-; AVX512DQBW-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 448(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 256(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 320(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 128(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 192(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, (%rax)
-; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 64(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 384(%rax)
-; AVX512DQBW-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 448(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 256(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 320(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 128(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 192(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, (%rax)
-; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 64(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 384(%rax)
-; AVX512DQBW-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQBW-FAST-NEXT:    vmovups (%rsp), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 384(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 448(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 256(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 320(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 128(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 192(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, (%rax)
-; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 64(%rax)
-; AVX512DQBW-FAST-NEXT:    addq $6728, %rsp # imm = 0x1A48
-; AVX512DQBW-FAST-NEXT:    vzeroupper
-; AVX512DQBW-FAST-NEXT:    retq
+; AVX512F-LABEL: load_i64_stride8_vf64:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    subq $6600, %rsp # imm = 0x19C8
+; AVX512F-NEXT:    vmovdqa64 3392(%rdi), %zmm3
+; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 3328(%rdi), %zmm6
+; AVX512F-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 3520(%rdi), %zmm13
+; AVX512F-NEXT:    vmovdqa64 3456(%rdi), %zmm17
+; AVX512F-NEXT:    vmovdqa64 1856(%rdi), %zmm16
+; AVX512F-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 1984(%rdi), %zmm14
+; AVX512F-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 832(%rdi), %zmm9
+; AVX512F-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 768(%rdi), %zmm10
+; AVX512F-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 960(%rdi), %zmm11
+; AVX512F-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 896(%rdi), %zmm12
+; AVX512F-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 320(%rdi), %zmm15
+; AVX512F-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 256(%rdi), %zmm5
+; AVX512F-NEXT:    vmovdqa64 448(%rdi), %zmm4
+; AVX512F-NEXT:    vmovdqa64 384(%rdi), %zmm7
+; AVX512F-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    movb $-64, %al
+; AVX512F-NEXT:    kmovw %eax, %k1
+; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10]
+; AVX512F-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vmovdqa64 %zmm17, %zmm0
+; AVX512F-NEXT:    vpermt2q %zmm13, %zmm2, %zmm0
+; AVX512F-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm1
+; AVX512F-NEXT:    vpermt2q %zmm3, %zmm2, %zmm1
+; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512F-NEXT:    vmovdqa64 3264(%rdi), %ymm30
+; AVX512F-NEXT:    vmovdqa 3200(%rdi), %ymm0
+; AVX512F-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm30[0],ymm0[2],ymm30[2]
+; AVX512F-NEXT:    vmovdqa64 3136(%rdi), %ymm31
+; AVX512F-NEXT:    vmovdqa 3072(%rdi), %ymm3
+; AVX512F-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm3[0],ymm31[0],ymm3[2],ymm31[2]
+; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm12, %zmm0
+; AVX512F-NEXT:    vpermt2q %zmm11, %zmm2, %zmm0
+; AVX512F-NEXT:    vmovdqa64 %zmm10, %zmm1
+; AVX512F-NEXT:    vpermt2q %zmm9, %zmm2, %zmm1
+; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512F-NEXT:    vmovdqa 704(%rdi), %ymm3
+; AVX512F-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-NEXT:    vmovdqa 640(%rdi), %ymm0
+; AVX512F-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
+; AVX512F-NEXT:    vmovdqa 576(%rdi), %ymm3
+; AVX512F-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-NEXT:    vmovdqa64 512(%rdi), %ymm26
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm26[0],ymm3[0],ymm26[2],ymm3[2]
+; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm7, %zmm0
+; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm6
+; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm4, %zmm2, %zmm0
+; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm1
+; AVX512F-NEXT:    vpermt2q %zmm15, %zmm2, %zmm1
+; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512F-NEXT:    vmovdqa64 192(%rdi), %ymm23
+; AVX512F-NEXT:    vmovdqa64 128(%rdi), %ymm19
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm19[0],ymm23[0],ymm19[2],ymm23[2]
+; AVX512F-NEXT:    vmovdqa64 64(%rdi), %ymm18
+; AVX512F-NEXT:    vmovdqa (%rdi), %ymm12
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm12[0],ymm18[0],ymm12[2],ymm18[2]
+; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
+; AVX512F-NEXT:    vmovdqa64 1920(%rdi), %zmm3
+; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm0
+; AVX512F-NEXT:    vpermt2q %zmm14, %zmm2, %zmm0
+; AVX512F-NEXT:    vmovdqa64 1792(%rdi), %zmm1
+; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm16, %zmm2, %zmm1
+; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512F-NEXT:    vmovdqa64 1728(%rdi), %ymm25
+; AVX512F-NEXT:    vmovdqa64 1664(%rdi), %ymm16
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm16[0],ymm25[0],ymm16[2],ymm25[2]
+; AVX512F-NEXT:    vmovdqa64 1600(%rdi), %ymm21
+; AVX512F-NEXT:    vmovdqa64 1536(%rdi), %ymm20
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm20[0],ymm21[0],ymm20[2],ymm21[2]
+; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 1472(%rdi), %zmm1
+; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 1408(%rdi), %zmm0
+; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
+; AVX512F-NEXT:    vmovdqa64 1344(%rdi), %zmm3
+; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 1280(%rdi), %zmm1
+; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm3, %zmm2, %zmm1
+; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512F-NEXT:    vmovdqa 1216(%rdi), %ymm0
+; AVX512F-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-NEXT:    vmovdqa64 1152(%rdi), %ymm29
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm29[0],ymm0[0],ymm29[2],ymm0[2]
+; AVX512F-NEXT:    vmovdqa64 1088(%rdi), %ymm27
+; AVX512F-NEXT:    vmovdqa64 1024(%rdi), %ymm28
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm28[0],ymm27[0],ymm28[2],ymm27[2]
+; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 3008(%rdi), %zmm1
+; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 2944(%rdi), %zmm0
+; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
+; AVX512F-NEXT:    vmovdqa64 2880(%rdi), %zmm3
+; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 2816(%rdi), %zmm1
+; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm3, %zmm2, %zmm1
+; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512F-NEXT:    vmovdqa 2752(%rdi), %ymm0
+; AVX512F-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-NEXT:    vmovdqa 2688(%rdi), %ymm11
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm11[0],ymm0[0],ymm11[2],ymm0[2]
+; AVX512F-NEXT:    vmovdqa 2624(%rdi), %ymm10
+; AVX512F-NEXT:    vmovdqa 2560(%rdi), %ymm9
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm9[0],ymm10[0],ymm9[2],ymm10[2]
+; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 2496(%rdi), %zmm1
+; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 2432(%rdi), %zmm0
+; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
+; AVX512F-NEXT:    vmovdqa64 2368(%rdi), %zmm3
+; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 2304(%rdi), %zmm1
+; AVX512F-NEXT:    vmovdqu64 %zmm1, (%rsp) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm3, %zmm2, %zmm1
+; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512F-NEXT:    vmovdqa 2240(%rdi), %ymm0
+; AVX512F-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-NEXT:    vmovdqa 2176(%rdi), %ymm7
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm7[0],ymm0[0],ymm7[2],ymm0[2]
+; AVX512F-NEXT:    vmovdqa 2112(%rdi), %ymm3
+; AVX512F-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-NEXT:    vmovdqa64 2048(%rdi), %ymm22
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm22[0],ymm3[0],ymm22[2],ymm3[2]
+; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 4032(%rdi), %zmm1
+; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 3968(%rdi), %zmm0
+; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
+; AVX512F-NEXT:    vmovdqa64 3904(%rdi), %zmm1
+; AVX512F-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 3840(%rdi), %zmm3
+; AVX512F-NEXT:    vpermi2q %zmm1, %zmm3, %zmm2
+; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k1}
+; AVX512F-NEXT:    vmovdqa64 3776(%rdi), %ymm24
+; AVX512F-NEXT:    vmovdqa 3712(%rdi), %ymm8
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm15 = ymm8[0],ymm24[0],ymm8[2],ymm24[2]
+; AVX512F-NEXT:    vmovdqa 3648(%rdi), %ymm1
+; AVX512F-NEXT:    vmovdqa 3584(%rdi), %ymm0
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm14 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm14 = ymm14[2,3],ymm15[2,3]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm14, %zmm2, %zmm2
+; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,3,11,3,11,3,11]
+; AVX512F-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vmovdqa64 %zmm17, %zmm14
+; AVX512F-NEXT:    vpermt2q %zmm13, %zmm2, %zmm14
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm4, %zmm2, %zmm15
+; AVX512F-NEXT:    vmovdqa64 %zmm14, %zmm15 {%k1}
+; AVX512F-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm14 = ymm13[1],ymm30[1],ymm13[3],ymm30[3]
+; AVX512F-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm13 = ymm13[1],ymm31[1],ymm13[3],ymm31[3]
+; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm13 = ymm13[2,3],ymm14[2,3]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm13, %zmm15, %zmm13
+; AVX512F-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm30, %zmm13
+; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm13 # 64-byte Folded Reload
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm31, %zmm14
+; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm14 # 64-byte Folded Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm14 {%k1}
+; AVX512F-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
+; AVX512F-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
+; AVX512F-NEXT:    # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3]
+; AVX512F-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm26, %ymm15 # 32-byte Folded Reload
+; AVX512F-NEXT:    # ymm15 = ymm26[1],mem[1],ymm26[3],mem[3]
+; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm13, %zmm14, %zmm13
+; AVX512F-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm6, %zmm2, %zmm13
+; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm14
+; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm6
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm26, %zmm2, %zmm14
+; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm14 {%k1}
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm13 = ymm19[1],ymm23[1],ymm19[3],ymm23[3]
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm15 = ymm12[1],ymm18[1],ymm12[3],ymm18[3]
+; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm13, %zmm14, %zmm13
+; AVX512F-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm23, %zmm13
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm19, %zmm2, %zmm13
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm12, %zmm14
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm18, %zmm2, %zmm14
+; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm14 {%k1}
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm13 = ymm16[1],ymm25[1],ymm16[3],ymm25[3]
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm15 = ymm20[1],ymm21[1],ymm20[3],ymm21[3]
+; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm13, %zmm14, %zmm13
+; AVX512F-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm21, %zmm13
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm25, %zmm2, %zmm13
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm16, %zmm14
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm20, %zmm2, %zmm14
+; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm14 {%k1}
+; AVX512F-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm29, %ymm13 # 32-byte Folded Reload
+; AVX512F-NEXT:    # ymm13 = ymm29[1],mem[1],ymm29[3],mem[3]
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm15 = ymm28[1],ymm27[1],ymm28[3],ymm27[3]
+; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm13, %zmm14, %zmm13
+; AVX512F-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm29, %zmm13
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm28, %zmm2, %zmm13
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm27, %zmm2, %zmm14
+; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm14 {%k1}
+; AVX512F-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
+; AVX512F-NEXT:    # ymm11 = ymm11[1],mem[1],ymm11[3],mem[3]
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm9 = ymm9[1],ymm10[1],ymm9[3],ymm10[3]
+; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm9 = ymm9[2,3],ymm11[2,3]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm9, %zmm14, %zmm9
+; AVX512F-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm14, %zmm9
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm15, %zmm2, %zmm9
+; AVX512F-NEXT:    vmovdqu64 (%rsp), %zmm13 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm13, %zmm10
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm11, %zmm2, %zmm10
+; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm10 {%k1}
+; AVX512F-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
+; AVX512F-NEXT:    # ymm7 = ymm7[1],mem[1],ymm7[3],mem[3]
+; AVX512F-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm22, %ymm5 # 32-byte Folded Reload
+; AVX512F-NEXT:    # ymm5 = ymm22[1],mem[1],ymm22[3],mem[3]
+; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm5 = ymm5[2,3],ymm7[2,3]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm5, %zmm10, %zmm5
+; AVX512F-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm7, %zmm5
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm22, %zmm2, %zmm5
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512F-NEXT:    vpermi2q %zmm9, %zmm3, %zmm2
+; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm2 {%k1}
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm8[1],ymm24[1],ymm8[3],ymm24[3]
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm0
+; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12]
+; AVX512F-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm17, %zmm1
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm10, %zmm0, %zmm1
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm2 = zmm24[0],zmm4[0],zmm24[2],zmm4[2],zmm24[4],zmm4[4],zmm24[6],zmm4[6]
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
+; AVX512F-NEXT:    vmovdqa64 3136(%rdi), %zmm4
+; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 3072(%rdi), %zmm3
+; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm4, %zmm0, %zmm3
+; AVX512F-NEXT:    vmovdqa64 3264(%rdi), %zmm5
+; AVX512F-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 3200(%rdi), %zmm4
+; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [4,12,4,12]
+; AVX512F-NEXT:    # ymm1 = mem[0,1,0,1]
+; AVX512F-NEXT:    vpermt2q %zmm5, %zmm1, %zmm4
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm3, %zmm2, %zmm2
+; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm30 # 64-byte Folded Reload
+; AVX512F-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm31, %zmm3 # 64-byte Folded Reload
+; AVX512F-NEXT:    # zmm3 = zmm31[0],mem[0],zmm31[2],mem[2],zmm31[4],mem[4],zmm31[6],mem[6]
+; AVX512F-NEXT:    vmovdqa64 %zmm30, %zmm3 {%k1}
+; AVX512F-NEXT:    vmovdqa64 576(%rdi), %zmm4
+; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 512(%rdi), %zmm2
+; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm4, %zmm0, %zmm2
+; AVX512F-NEXT:    vmovdqa64 704(%rdi), %zmm5
+; AVX512F-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 640(%rdi), %zmm4
+; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm5, %zmm1, %zmm4
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm30, %zmm0, %zmm2
+; AVX512F-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm6[0],zmm26[0],zmm6[2],zmm26[2],zmm6[4],zmm26[4],zmm6[6],zmm26[6]
+; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512F-NEXT:    vmovdqa64 64(%rdi), %zmm4
+; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 (%rdi), %zmm2
+; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm4, %zmm0, %zmm2
+; AVX512F-NEXT:    vmovdqa64 192(%rdi), %zmm26
+; AVX512F-NEXT:    vmovdqa64 128(%rdi), %zmm4
+; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm26, %zmm1, %zmm4
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm23, %zmm2
+; AVX512F-NEXT:    vpermt2q %zmm19, %zmm0, %zmm2
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm12[0],zmm18[0],zmm12[2],zmm18[2],zmm12[4],zmm18[4],zmm12[6],zmm18[6]
+; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512F-NEXT:    vmovdqa64 1600(%rdi), %zmm4
+; AVX512F-NEXT:    vmovdqa64 1536(%rdi), %zmm2
+; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm4, %zmm0, %zmm2
+; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm19
+; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 1728(%rdi), %zmm23
+; AVX512F-NEXT:    vmovdqa64 1664(%rdi), %zmm4
+; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm23, %zmm1, %zmm4
+; AVX512F-NEXT:    vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm21, %zmm2
+; AVX512F-NEXT:    vpermt2q %zmm25, %zmm0, %zmm2
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm16[0],zmm20[0],zmm16[2],zmm20[2],zmm16[4],zmm20[4],zmm16[6],zmm20[6]
+; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512F-NEXT:    vmovdqa64 1088(%rdi), %zmm4
+; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 1024(%rdi), %zmm2
+; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm4, %zmm0, %zmm2
+; AVX512F-NEXT:    vmovdqa64 1216(%rdi), %zmm6
+; AVX512F-NEXT:    vmovdqa64 1152(%rdi), %zmm4
+; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm6, %zmm1, %zmm4
+; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm21
+; AVX512F-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm29, %zmm2
+; AVX512F-NEXT:    vpermt2q %zmm28, %zmm0, %zmm2
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm8[0],zmm27[0],zmm8[2],zmm27[2],zmm8[4],zmm27[4],zmm8[6],zmm27[6]
+; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512F-NEXT:    vmovdqa64 2624(%rdi), %zmm29
+; AVX512F-NEXT:    vmovdqa64 2560(%rdi), %zmm2
+; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm29, %zmm0, %zmm2
+; AVX512F-NEXT:    vmovdqa64 2752(%rdi), %zmm28
+; AVX512F-NEXT:    vmovdqa64 2688(%rdi), %zmm4
+; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm28, %zmm1, %zmm4
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 2112(%rdi), %zmm3
+; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 2048(%rdi), %zmm2
+; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm3, %zmm0, %zmm2
+; AVX512F-NEXT:    vmovdqa64 2240(%rdi), %zmm4
+; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 2176(%rdi), %zmm3
+; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm4, %zmm1, %zmm3
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512F-NEXT:    vmovdqa64 %zmm14, %zmm3
+; AVX512F-NEXT:    vpermt2q %zmm15, %zmm0, %zmm3
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm4 = zmm13[0],zmm11[0],zmm13[2],zmm11[2],zmm13[4],zmm11[4],zmm13[6],zmm11[6]
+; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm7, %zmm2
+; AVX512F-NEXT:    vmovdqa64 %zmm7, %zmm6
+; AVX512F-NEXT:    vpermt2q %zmm22, %zmm0, %zmm2
+; AVX512F-NEXT:    vmovdqa64 %zmm22, %zmm5
+; AVX512F-NEXT:    vmovdqa64 3648(%rdi), %zmm4
+; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 3584(%rdi), %zmm3
+; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermi2q %zmm4, %zmm3, %zmm0
+; AVX512F-NEXT:    vmovdqa64 3776(%rdi), %zmm4
+; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 3712(%rdi), %zmm3
+; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermi2q %zmm4, %zmm3, %zmm1
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm1 = zmm1[0],zmm9[0],zmm1[2],zmm9[2],zmm1[4],zmm9[4],zmm1[6],zmm9[6]
+; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm1 {%k1}
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [5,13,5,13,5,13,5,13]
+; AVX512F-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [5,13,5,13]
+; AVX512F-NEXT:    # ymm1 = mem[0,1,0,1]
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm31, %zmm1, %zmm3
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512F-NEXT:    vmovdqa64 %zmm17, %zmm3
+; AVX512F-NEXT:    vpermt2q %zmm10, %zmm0, %zmm3
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm4 = zmm24[1],zmm17[1],zmm24[3],zmm17[3],zmm24[5],zmm17[5],zmm24[7],zmm17[7]
+; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm15, %zmm0, %zmm2
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm16, %zmm1, %zmm3
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm27, %zmm3
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm18, %zmm0, %zmm3
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm4 = zmm25[1],zmm22[1],zmm25[3],zmm22[3],zmm25[5],zmm22[5],zmm25[7],zmm22[7]
+; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm13, %zmm0, %zmm2
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm26, %zmm1, %zmm3
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm20, %zmm3
+; AVX512F-NEXT:    vpermt2q %zmm30, %zmm0, %zmm3
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512F-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm4 # 64-byte Folded Reload
+; AVX512F-NEXT:    # zmm4 = zmm12[1],mem[1],zmm12[3],mem[3],zmm12[5],mem[5],zmm12[7],mem[7]
+; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm19, %zmm0, %zmm2
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm23, %zmm1, %zmm3
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
+; AVX512F-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm19, %zmm4 # 64-byte Folded Reload
+; AVX512F-NEXT:    # zmm4 = zmm19[1],mem[1],zmm19[3],mem[3],zmm19[5],mem[5],zmm19[7],mem[7]
+; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm23, %zmm0, %zmm2
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm21, %zmm1, %zmm3
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
+; AVX512F-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm24, %zmm4 # 64-byte Folded Reload
+; AVX512F-NEXT:    # zmm4 = zmm24[1],mem[1],zmm24[3],mem[3],zmm24[5],mem[5],zmm24[7],mem[7]
+; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm29, %zmm0, %zmm2
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm28, %zmm1, %zmm3
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
+; AVX512F-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm4 # 64-byte Folded Reload
+; AVX512F-NEXT:    # zmm4 = zmm8[1],mem[1],zmm8[3],mem[3],zmm8[5],mem[5],zmm8[7],mem[7]
+; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm9, %zmm0, %zmm2
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm10, %zmm3
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm11, %zmm1, %zmm3
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512F-NEXT:    vmovdqa64 %zmm14, %zmm3
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm21, %zmm0, %zmm3
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqu64 (%rsp), %zmm8 # 64-byte Reload
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm4 = zmm8[1],zmm7[1],zmm8[3],zmm7[3],zmm8[5],zmm7[5],zmm8[7],zmm7[7]
+; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm2
+; AVX512F-NEXT:    vpermt2q %zmm5, %zmm0, %zmm2
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512F-NEXT:    vpermi2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm0 # 64-byte Folded Reload
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-NEXT:    vpermi2q %zmm4, %zmm5, %zmm1
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 64-byte Folded Reload
+; AVX512F-NEXT:    # zmm1 = zmm1[1],mem[1],zmm1[3],mem[3],zmm1[5],mem[5],zmm1[7],mem[7]
+; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm1 {%k1}
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [6,14,6,14,6,14,6,14]
+; AVX512F-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14]
+; AVX512F-NEXT:    # ymm1 = mem[0,1,0,1]
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm31, %zmm1, %zmm3
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm17, %zmm0, %zmm3
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
+; AVX512F-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm3 {%k1} # 64-byte Folded Reload
+; AVX512F-NEXT:    # zmm3 {%k1} = zmm17[0],mem[0],zmm17[2],mem[2],zmm17[4],mem[4],zmm17[6],mem[6]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm15, %zmm0, %zmm2
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm16, %zmm1, %zmm3
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512F-NEXT:    vmovdqa64 %zmm25, %zmm3
+; AVX512F-NEXT:    vpermt2q %zmm22, %zmm0, %zmm3
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm27[0],zmm18[0],zmm27[2],zmm18[2],zmm27[4],zmm18[4],zmm27[6],zmm18[6]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm13, %zmm0, %zmm2
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm26, %zmm1, %zmm3
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512F-NEXT:    vmovdqa64 %zmm12, %zmm3
+; AVX512F-NEXT:    vmovdqa64 %zmm12, %zmm26
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm13, %zmm0, %zmm3
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm20[0],zmm30[0],zmm20[2],zmm30[2],zmm20[4],zmm30[4],zmm20[6],zmm30[6]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm3 # 64-byte Folded Reload
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512F-NEXT:    vmovdqa64 %zmm19, %zmm3
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm16, %zmm0, %zmm3
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm27[0],zmm31[0],zmm27[2],zmm31[2],zmm27[4],zmm31[4],zmm27[6],zmm31[6]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm23, %zmm0, %zmm2
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm3 # 64-byte Folded Reload
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512F-NEXT:    vmovdqa64 %zmm24, %zmm3
+; AVX512F-NEXT:    vmovdqa64 %zmm24, %zmm17
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm15, %zmm0, %zmm3
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm23[0],zmm22[0],zmm23[2],zmm22[2],zmm23[4],zmm22[4],zmm23[6],zmm22[6]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm29, %zmm0, %zmm2
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm28, %zmm1, %zmm3
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm12, %zmm3
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm18, %zmm0, %zmm3
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm29[0],zmm28[0],zmm29[2],zmm28[2],zmm29[4],zmm28[4],zmm29[6],zmm28[6]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm9, %zmm0, %zmm2
+; AVX512F-NEXT:    vpermt2q %zmm11, %zmm1, %zmm10
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7]
+; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm3
+; AVX512F-NEXT:    vpermt2q %zmm7, %zmm0, %zmm3
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm14[0],zmm21[0],zmm14[2],zmm21[2],zmm14[4],zmm21[4],zmm14[6],zmm21[6]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm2
+; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
+; AVX512F-NEXT:    vpermi2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm0 # 64-byte Folded Reload
+; AVX512F-NEXT:    vpermi2q %zmm4, %zmm5, %zmm1
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm6[0],zmm11[0],zmm6[2],zmm11[2],zmm6[4],zmm11[4],zmm6[6],zmm11[6]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm0
+; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm25, %zmm4
+; AVX512F-NEXT:    vmovdqa64 %zmm25, %zmm24
+; AVX512F-NEXT:    vmovdqa64 %zmm25, %zmm0
+; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm5 = [7,15,7,15,7,15,7,15]
+; AVX512F-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm3, %zmm5, %zmm4
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm4 {%k1} = zmm25[1],zmm2[1],zmm25[3],zmm2[3],zmm25[5],zmm2[5],zmm25[7],zmm2[7]
+; AVX512F-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm25, %zmm10
+; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [0,8,0,8,0,8,0,8]
+; AVX512F-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vpermt2q %zmm2, %zmm1, %zmm25
+; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm8 = [1,9,1,9,1,9,1,9]
+; AVX512F-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vpermt2q %zmm2, %zmm8, %zmm10
+; AVX512F-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm3, %zmm1, %zmm24
+; AVX512F-NEXT:    vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm3, %zmm8, %zmm0
+; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm26, %zmm2
+; AVX512F-NEXT:    vmovdqa64 %zmm26, %zmm24
+; AVX512F-NEXT:    vmovdqa64 %zmm26, %zmm0
+; AVX512F-NEXT:    vpermt2q %zmm13, %zmm5, %zmm2
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm20[1],zmm30[1],zmm20[3],zmm30[3],zmm20[5],zmm30[5],zmm20[7],zmm30[7]
+; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm20, %zmm2
+; AVX512F-NEXT:    vpermt2q %zmm30, %zmm1, %zmm20
+; AVX512F-NEXT:    vpermt2q %zmm30, %zmm8, %zmm2
+; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm13, %zmm1, %zmm24
+; AVX512F-NEXT:    vpermt2q %zmm13, %zmm8, %zmm0
+; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm19, %zmm2
+; AVX512F-NEXT:    vmovdqa64 %zmm19, %zmm13
+; AVX512F-NEXT:    vmovdqa64 %zmm19, %zmm0
+; AVX512F-NEXT:    vpermt2q %zmm16, %zmm5, %zmm2
+; AVX512F-NEXT:    vmovdqa64 %zmm27, %zmm19
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm27[1],zmm31[1],zmm27[3],zmm31[3],zmm27[5],zmm31[5],zmm27[7],zmm31[7]
+; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm27, %zmm2
+; AVX512F-NEXT:    vpermt2q %zmm31, %zmm1, %zmm19
+; AVX512F-NEXT:    vpermt2q %zmm31, %zmm8, %zmm2
+; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm16, %zmm1, %zmm13
+; AVX512F-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm16, %zmm8, %zmm0
+; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm17, %zmm3
+; AVX512F-NEXT:    vmovdqa64 %zmm17, %zmm2
+; AVX512F-NEXT:    vpermt2q %zmm15, %zmm5, %zmm3
+; AVX512F-NEXT:    vmovdqa64 %zmm23, %zmm16
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm23[1],zmm22[1],zmm23[3],zmm22[3],zmm23[5],zmm22[5],zmm23[7],zmm22[7]
+; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm23, %zmm3
+; AVX512F-NEXT:    vpermt2q %zmm22, %zmm1, %zmm16
+; AVX512F-NEXT:    vpermt2q %zmm22, %zmm8, %zmm3
+; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm15, %zmm1, %zmm17
+; AVX512F-NEXT:    vpermt2q %zmm15, %zmm8, %zmm2
+; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm12, %zmm26
+; AVX512F-NEXT:    vmovdqa64 %zmm12, %zmm31
+; AVX512F-NEXT:    vpermt2q %zmm18, %zmm5, %zmm12
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm12 {%k1} = zmm29[1],zmm28[1],zmm29[3],zmm28[3],zmm29[5],zmm28[5],zmm29[7],zmm28[7]
+; AVX512F-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm29, %zmm2
+; AVX512F-NEXT:    vpermt2q %zmm28, %zmm1, %zmm29
+; AVX512F-NEXT:    vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm28, %zmm8, %zmm2
+; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm18, %zmm1, %zmm26
+; AVX512F-NEXT:    vpermt2q %zmm18, %zmm8, %zmm31
+; AVX512F-NEXT:    vmovdqu64 (%rsp), %zmm2 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm28
+; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm23
+; AVX512F-NEXT:    vpermt2q %zmm7, %zmm5, %zmm2
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm14[1],zmm21[1],zmm14[3],zmm21[3],zmm14[5],zmm21[5],zmm14[7],zmm21[7]
+; AVX512F-NEXT:    vmovdqu64 %zmm2, (%rsp) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm14, %zmm2
+; AVX512F-NEXT:    vmovdqa64 %zmm14, %zmm29
+; AVX512F-NEXT:    vpermt2q %zmm21, %zmm1, %zmm29
+; AVX512F-NEXT:    vpermt2q %zmm21, %zmm8, %zmm2
+; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermt2q %zmm7, %zmm1, %zmm28
+; AVX512F-NEXT:    vpermt2q %zmm7, %zmm8, %zmm23
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm14, %zmm0
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm4, %zmm1, %zmm0
+; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm27, %zmm21
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q %zmm30, %zmm1, %zmm21
+; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm18
+; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm0
+; AVX512F-NEXT:    vpermt2q %zmm11, %zmm1, %zmm0
+; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512F-NEXT:    vpermi2q %zmm6, %zmm9, %zmm1
+; AVX512F-NEXT:    vmovdqa64 %zmm14, %zmm7
+; AVX512F-NEXT:    vmovdqa64 %zmm14, %zmm2
+; AVX512F-NEXT:    vpermt2q %zmm4, %zmm8, %zmm7
+; AVX512F-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm27, %zmm22
+; AVX512F-NEXT:    vpermt2q %zmm30, %zmm8, %zmm22
+; AVX512F-NEXT:    vmovdqa64 %zmm18, %zmm3
+; AVX512F-NEXT:    vpermt2q %zmm11, %zmm8, %zmm3
+; AVX512F-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpermi2q %zmm6, %zmm9, %zmm8
+; AVX512F-NEXT:    vpermt2q %zmm6, %zmm5, %zmm9
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm9 {%k1} = zmm18[1],zmm11[1],zmm18[3],zmm11[3],zmm18[5],zmm11[5],zmm18[7],zmm11[7]
+; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm18
+; AVX512F-NEXT:    vpermt2q %zmm30, %zmm5, %zmm27
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm2[1],zmm4[1],zmm2[3],zmm4[3],zmm2[5],zmm4[5],zmm2[7],zmm4[7]
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm13 # 64-byte Folded Reload
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm9 # 64-byte Folded Reload
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm10 # 64-byte Folded Reload
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm11 # 64-byte Folded Reload
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm12 # 64-byte Folded Reload
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm14 # 64-byte Folded Reload
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm0 # 64-byte Folded Reload
+; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm15 # 64-byte Folded Reload
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [7,15,7,15]
+; AVX512F-NEXT:    # ymm0 = mem[0,1,0,1]
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm6 # 64-byte Folded Reload
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm7 # 64-byte Folded Reload
+; AVX512F-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm7 # 64-byte Folded Reload
+; AVX512F-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512F-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm7 # 64-byte Folded Reload
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm2[4,5,6,7]
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm0
+; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm3[4,5,6,7]
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm0
+; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm4[4,5,6,7]
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm0
+; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm5[4,5,6,7]
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm0
+; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm6[4,5,6,7]
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm0
+; AVX512F-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm0 # 32-byte Folded Reload
+; AVX512F-NEXT:    # ymm0 = ymm14[0,1,2,3],mem[4,5,6,7]
+; AVX512F-NEXT:    vmovdqu64 (%rsp), %zmm2 # 64-byte Reload
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm14
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX512F-NEXT:    # ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm18, %zmm18
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm7[4,5,6,7]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm27, %zmm15
+; AVX512F-NEXT:    vmovdqa64 %zmm20, %zmm24 {%k1}
+; AVX512F-NEXT:    vmovdqa64 64(%rdi), %xmm20
+; AVX512F-NEXT:    vinserti32x4 $1, 192(%rdi), %ymm20, %ymm20
+; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512F-NEXT:    vinserti128 $1, 128(%rdi), %ymm0, %ymm0
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm30 = ymm0[0],ymm20[0],ymm0[2],ymm20[2]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm30, %zmm24, %zmm2
+; AVX512F-NEXT:    vmovdqu64 %zmm2, (%rsp) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm25, %zmm2 {%k1}
+; AVX512F-NEXT:    vmovdqa64 576(%rdi), %xmm25
+; AVX512F-NEXT:    vinserti32x4 $1, 704(%rdi), %ymm25, %ymm25
+; AVX512F-NEXT:    vmovdqa64 512(%rdi), %xmm30
+; AVX512F-NEXT:    vinserti32x4 $1, 640(%rdi), %ymm30, %ymm30
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm27 = ymm30[0],ymm25[0],ymm30[2],ymm25[2]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm27, %zmm2, %zmm2
+; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm16, %zmm17 {%k1}
+; AVX512F-NEXT:    vmovdqa64 1088(%rdi), %xmm16
+; AVX512F-NEXT:    vinserti32x4 $1, 1216(%rdi), %ymm16, %ymm16
+; AVX512F-NEXT:    vmovdqa64 1024(%rdi), %xmm27
+; AVX512F-NEXT:    vinserti32x4 $1, 1152(%rdi), %ymm27, %ymm27
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm6 = ymm27[0],ymm16[0],ymm27[2],ymm16[2]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm6, %zmm17, %zmm2
+; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm19, %zmm2 {%k1}
+; AVX512F-NEXT:    vmovdqa 1600(%rdi), %xmm9
+; AVX512F-NEXT:    vinserti128 $1, 1728(%rdi), %ymm9, %ymm9
+; AVX512F-NEXT:    vmovdqa64 1536(%rdi), %xmm19
+; AVX512F-NEXT:    vinserti32x4 $1, 1664(%rdi), %ymm19, %ymm19
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm24 = ymm19[0],ymm9[0],ymm19[2],ymm9[2]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm24, %zmm2, %zmm2
+; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vmovdqa64 %zmm29, %zmm28 {%k1}
+; AVX512F-NEXT:    vmovdqa 2112(%rdi), %xmm5
+; AVX512F-NEXT:    vinserti128 $1, 2240(%rdi), %ymm5, %ymm5
+; AVX512F-NEXT:    vmovdqa64 2048(%rdi), %xmm24
+; AVX512F-NEXT:    vinserti32x4 $1, 2176(%rdi), %ymm24, %ymm24
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm12 = ymm24[0],ymm5[0],ymm24[2],ymm5[2]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm12, %zmm28, %zmm12
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm26 {%k1}
+; AVX512F-NEXT:    vmovdqa 2624(%rdi), %xmm13
+; AVX512F-NEXT:    vinserti128 $1, 2752(%rdi), %ymm13, %ymm13
+; AVX512F-NEXT:    vmovdqa64 2560(%rdi), %xmm28
+; AVX512F-NEXT:    vinserti32x4 $1, 2688(%rdi), %ymm28, %ymm28
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm6 = ymm28[0],ymm13[0],ymm28[2],ymm13[2]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm6, %zmm26, %zmm6
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm21 {%k1}
+; AVX512F-NEXT:    vmovdqa 3136(%rdi), %xmm7
+; AVX512F-NEXT:    vinserti128 $1, 3264(%rdi), %ymm7, %ymm7
+; AVX512F-NEXT:    vmovdqa64 3072(%rdi), %xmm26
+; AVX512F-NEXT:    vinserti32x4 $1, 3200(%rdi), %ymm26, %ymm26
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm17 = ymm26[0],ymm7[0],ymm26[2],ymm7[2]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm17, %zmm21, %zmm3
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm1 {%k1}
+; AVX512F-NEXT:    vmovdqa 3648(%rdi), %xmm10
+; AVX512F-NEXT:    vinserti128 $1, 3776(%rdi), %ymm10, %ymm10
+; AVX512F-NEXT:    vmovdqa64 3584(%rdi), %xmm17
+; AVX512F-NEXT:    vinserti32x4 $1, 3712(%rdi), %ymm17, %ymm17
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm29 = ymm17[0],ymm10[0],ymm17[2],ymm10[2]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm29, %zmm1, %zmm1
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm22 {%k1}
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm7 = ymm26[1],ymm7[1],ymm26[3],ymm7[3]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm7, %zmm22, %zmm4
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm7, %zmm2 {%k1}
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm7 = ymm30[1],ymm25[1],ymm30[3],ymm25[3]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm7, %zmm2, %zmm7
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm11, %zmm2 {%k1}
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm20[1],ymm0[3],ymm20[3]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm0
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm11, %zmm2 {%k1}
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm9 = ymm19[1],ymm9[1],ymm19[3],ymm9[3]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm9, %zmm2, %zmm9
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm11, %zmm2 {%k1}
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm11 = ymm27[1],ymm16[1],ymm27[3],ymm16[3]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm11, %zmm2, %zmm11
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm31 {%k1}
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm13 = ymm28[1],ymm13[1],ymm28[3],ymm13[3]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm13, %zmm31, %zmm13
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm23 {%k1}
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm5 = ymm24[1],ymm5[1],ymm24[3],ymm5[3]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm5, %zmm23, %zmm5
+; AVX512F-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm8 {%k1}
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm10 = ymm17[1],ymm10[1],ymm17[3],ymm10[3]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm10, %zmm8, %zmm2
+; AVX512F-NEXT:    vmovdqa64 %zmm1, 448(%rsi)
+; AVX512F-NEXT:    vmovdqa64 %zmm3, 384(%rsi)
+; AVX512F-NEXT:    vmovdqa64 %zmm6, 320(%rsi)
+; AVX512F-NEXT:    vmovdqa64 %zmm12, 256(%rsi)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm1, 192(%rsi)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm1, 128(%rsi)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm1, 64(%rsi)
+; AVX512F-NEXT:    vmovups (%rsp), %zmm1 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm1, (%rsi)
+; AVX512F-NEXT:    vmovdqa64 %zmm2, 448(%rdx)
+; AVX512F-NEXT:    vmovdqa64 %zmm5, 256(%rdx)
+; AVX512F-NEXT:    vmovdqa64 %zmm13, 320(%rdx)
+; AVX512F-NEXT:    vmovdqa64 %zmm11, 128(%rdx)
+; AVX512F-NEXT:    vmovdqa64 %zmm9, 192(%rdx)
+; AVX512F-NEXT:    vmovdqa64 %zmm0, (%rdx)
+; AVX512F-NEXT:    vmovdqa64 %zmm7, 64(%rdx)
+; AVX512F-NEXT:    vmovdqa64 %zmm4, 384(%rdx)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm0, 448(%rcx)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm0, 256(%rcx)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm0, 320(%rcx)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm0, 128(%rcx)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm0, 192(%rcx)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm0, (%rcx)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm0, 64(%rcx)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm0, 384(%rcx)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm0, 448(%r8)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm0, 256(%r8)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm0, 320(%r8)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm0, 128(%r8)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm0, 192(%r8)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm0, (%r8)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm0, 64(%r8)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm0, 384(%r8)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm0, 448(%r9)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm0, 256(%r9)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm0, 320(%r9)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm0, 128(%r9)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm0, 192(%r9)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm0, (%r9)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm0, 64(%r9)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm0, 384(%r9)
+; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm0, 448(%rax)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm0, 256(%rax)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm0, 320(%rax)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm0, 128(%rax)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm0, 192(%rax)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm0, (%rax)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm0, 64(%rax)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm0, 384(%rax)
+; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm0, 448(%rax)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm0, 256(%rax)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm0, 320(%rax)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm0, 128(%rax)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm0, 192(%rax)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm0, (%rax)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm0, 64(%rax)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm0, 384(%rax)
+; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512F-NEXT:    vmovdqa64 %zmm15, 384(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm18, 448(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm14, 256(%rax)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm0, 320(%rax)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm0, 128(%rax)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm0, 192(%rax)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm0, (%rax)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm0, 64(%rax)
+; AVX512F-NEXT:    addq $6600, %rsp # imm = 0x19C8
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512BW-LABEL: load_i64_stride8_vf64:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    subq $6600, %rsp # imm = 0x19C8
+; AVX512BW-NEXT:    vmovdqa64 3392(%rdi), %zmm3
+; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 3328(%rdi), %zmm6
+; AVX512BW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 3520(%rdi), %zmm13
+; AVX512BW-NEXT:    vmovdqa64 3456(%rdi), %zmm17
+; AVX512BW-NEXT:    vmovdqa64 1856(%rdi), %zmm16
+; AVX512BW-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 1984(%rdi), %zmm14
+; AVX512BW-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 832(%rdi), %zmm9
+; AVX512BW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 768(%rdi), %zmm10
+; AVX512BW-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 960(%rdi), %zmm11
+; AVX512BW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 896(%rdi), %zmm12
+; AVX512BW-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 320(%rdi), %zmm15
+; AVX512BW-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 256(%rdi), %zmm5
+; AVX512BW-NEXT:    vmovdqa64 448(%rdi), %zmm4
+; AVX512BW-NEXT:    vmovdqa64 384(%rdi), %zmm7
+; AVX512BW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    movb $-64, %al
+; AVX512BW-NEXT:    kmovd %eax, %k1
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10]
+; AVX512BW-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vmovdqa64 %zmm17, %zmm0
+; AVX512BW-NEXT:    vpermt2q %zmm13, %zmm2, %zmm0
+; AVX512BW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm1
+; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm2, %zmm1
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 3264(%rdi), %ymm30
+; AVX512BW-NEXT:    vmovdqa 3200(%rdi), %ymm0
+; AVX512BW-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm30[0],ymm0[2],ymm30[2]
+; AVX512BW-NEXT:    vmovdqa64 3136(%rdi), %ymm31
+; AVX512BW-NEXT:    vmovdqa 3072(%rdi), %ymm3
+; AVX512BW-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm3[0],ymm31[0],ymm3[2],ymm31[2]
+; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm12, %zmm0
+; AVX512BW-NEXT:    vpermt2q %zmm11, %zmm2, %zmm0
+; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm1
+; AVX512BW-NEXT:    vpermt2q %zmm9, %zmm2, %zmm1
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512BW-NEXT:    vmovdqa 704(%rdi), %ymm3
+; AVX512BW-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512BW-NEXT:    vmovdqa 640(%rdi), %ymm0
+; AVX512BW-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
+; AVX512BW-NEXT:    vmovdqa 576(%rdi), %ymm3
+; AVX512BW-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 512(%rdi), %ymm26
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm26[0],ymm3[0],ymm26[2],ymm3[2]
+; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm0
+; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm6
+; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm2, %zmm0
+; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm1
+; AVX512BW-NEXT:    vpermt2q %zmm15, %zmm2, %zmm1
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 192(%rdi), %ymm23
+; AVX512BW-NEXT:    vmovdqa64 128(%rdi), %ymm19
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm19[0],ymm23[0],ymm19[2],ymm23[2]
+; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %ymm18
+; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm12
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm12[0],ymm18[0],ymm12[2],ymm18[2]
+; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
+; AVX512BW-NEXT:    vmovdqa64 1920(%rdi), %zmm3
+; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm0
+; AVX512BW-NEXT:    vpermt2q %zmm14, %zmm2, %zmm0
+; AVX512BW-NEXT:    vmovdqa64 1792(%rdi), %zmm1
+; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm16, %zmm2, %zmm1
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 1728(%rdi), %ymm25
+; AVX512BW-NEXT:    vmovdqa64 1664(%rdi), %ymm16
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm16[0],ymm25[0],ymm16[2],ymm25[2]
+; AVX512BW-NEXT:    vmovdqa64 1600(%rdi), %ymm21
+; AVX512BW-NEXT:    vmovdqa64 1536(%rdi), %ymm20
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm20[0],ymm21[0],ymm20[2],ymm21[2]
+; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 1472(%rdi), %zmm1
+; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 1408(%rdi), %zmm0
+; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
+; AVX512BW-NEXT:    vmovdqa64 1344(%rdi), %zmm3
+; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 1280(%rdi), %zmm1
+; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm2, %zmm1
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512BW-NEXT:    vmovdqa 1216(%rdi), %ymm0
+; AVX512BW-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 1152(%rdi), %ymm29
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm29[0],ymm0[0],ymm29[2],ymm0[2]
+; AVX512BW-NEXT:    vmovdqa64 1088(%rdi), %ymm27
+; AVX512BW-NEXT:    vmovdqa64 1024(%rdi), %ymm28
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm28[0],ymm27[0],ymm28[2],ymm27[2]
+; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 3008(%rdi), %zmm1
+; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 2944(%rdi), %zmm0
+; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
+; AVX512BW-NEXT:    vmovdqa64 2880(%rdi), %zmm3
+; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 2816(%rdi), %zmm1
+; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm2, %zmm1
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512BW-NEXT:    vmovdqa 2752(%rdi), %ymm0
+; AVX512BW-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512BW-NEXT:    vmovdqa 2688(%rdi), %ymm11
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm11[0],ymm0[0],ymm11[2],ymm0[2]
+; AVX512BW-NEXT:    vmovdqa 2624(%rdi), %ymm10
+; AVX512BW-NEXT:    vmovdqa 2560(%rdi), %ymm9
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm9[0],ymm10[0],ymm9[2],ymm10[2]
+; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 2496(%rdi), %zmm1
+; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 2432(%rdi), %zmm0
+; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
+; AVX512BW-NEXT:    vmovdqa64 2368(%rdi), %zmm3
+; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 2304(%rdi), %zmm1
+; AVX512BW-NEXT:    vmovdqu64 %zmm1, (%rsp) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm2, %zmm1
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512BW-NEXT:    vmovdqa 2240(%rdi), %ymm0
+; AVX512BW-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512BW-NEXT:    vmovdqa 2176(%rdi), %ymm7
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm7[0],ymm0[0],ymm7[2],ymm0[2]
+; AVX512BW-NEXT:    vmovdqa 2112(%rdi), %ymm3
+; AVX512BW-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 2048(%rdi), %ymm22
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm22[0],ymm3[0],ymm22[2],ymm3[2]
+; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 4032(%rdi), %zmm1
+; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 3968(%rdi), %zmm0
+; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
+; AVX512BW-NEXT:    vmovdqa64 3904(%rdi), %zmm1
+; AVX512BW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 3840(%rdi), %zmm3
+; AVX512BW-NEXT:    vpermi2q %zmm1, %zmm3, %zmm2
+; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 3776(%rdi), %ymm24
+; AVX512BW-NEXT:    vmovdqa 3712(%rdi), %ymm8
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm15 = ymm8[0],ymm24[0],ymm8[2],ymm24[2]
+; AVX512BW-NEXT:    vmovdqa 3648(%rdi), %ymm1
+; AVX512BW-NEXT:    vmovdqa 3584(%rdi), %ymm0
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm14 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm14 = ymm14[2,3],ymm15[2,3]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm14, %zmm2, %zmm2
+; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,3,11,3,11,3,11]
+; AVX512BW-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vmovdqa64 %zmm17, %zmm14
+; AVX512BW-NEXT:    vpermt2q %zmm13, %zmm2, %zmm14
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm2, %zmm15
+; AVX512BW-NEXT:    vmovdqa64 %zmm14, %zmm15 {%k1}
+; AVX512BW-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm14 = ymm13[1],ymm30[1],ymm13[3],ymm30[3]
+; AVX512BW-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm13 = ymm13[1],ymm31[1],ymm13[3],ymm31[3]
+; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm13 = ymm13[2,3],ymm14[2,3]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm13, %zmm15, %zmm13
+; AVX512BW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm30, %zmm13
+; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm13 # 64-byte Folded Reload
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm31, %zmm14
+; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm14 # 64-byte Folded Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm14 {%k1}
+; AVX512BW-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
+; AVX512BW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
+; AVX512BW-NEXT:    # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3]
+; AVX512BW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm26, %ymm15 # 32-byte Folded Reload
+; AVX512BW-NEXT:    # ymm15 = ymm26[1],mem[1],ymm26[3],mem[3]
+; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm13, %zmm14, %zmm13
+; AVX512BW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm6, %zmm2, %zmm13
+; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm14
+; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm6
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm26, %zmm2, %zmm14
+; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm14 {%k1}
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm13 = ymm19[1],ymm23[1],ymm19[3],ymm23[3]
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm15 = ymm12[1],ymm18[1],ymm12[3],ymm18[3]
+; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm13, %zmm14, %zmm13
+; AVX512BW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm23, %zmm13
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm19, %zmm2, %zmm13
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm12, %zmm14
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm18, %zmm2, %zmm14
+; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm14 {%k1}
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm13 = ymm16[1],ymm25[1],ymm16[3],ymm25[3]
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm15 = ymm20[1],ymm21[1],ymm20[3],ymm21[3]
+; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm13, %zmm14, %zmm13
+; AVX512BW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm21, %zmm13
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm25, %zmm2, %zmm13
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm16, %zmm14
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm20, %zmm2, %zmm14
+; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm14 {%k1}
+; AVX512BW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm29, %ymm13 # 32-byte Folded Reload
+; AVX512BW-NEXT:    # ymm13 = ymm29[1],mem[1],ymm29[3],mem[3]
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm15 = ymm28[1],ymm27[1],ymm28[3],ymm27[3]
+; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm13, %zmm14, %zmm13
+; AVX512BW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm29, %zmm13
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm28, %zmm2, %zmm13
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm27, %zmm2, %zmm14
+; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm14 {%k1}
+; AVX512BW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
+; AVX512BW-NEXT:    # ymm11 = ymm11[1],mem[1],ymm11[3],mem[3]
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm9 = ymm9[1],ymm10[1],ymm9[3],ymm10[3]
+; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm9 = ymm9[2,3],ymm11[2,3]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm9, %zmm14, %zmm9
+; AVX512BW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm14, %zmm9
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm15, %zmm2, %zmm9
+; AVX512BW-NEXT:    vmovdqu64 (%rsp), %zmm13 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm13, %zmm10
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm11, %zmm2, %zmm10
+; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm10 {%k1}
+; AVX512BW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
+; AVX512BW-NEXT:    # ymm7 = ymm7[1],mem[1],ymm7[3],mem[3]
+; AVX512BW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm22, %ymm5 # 32-byte Folded Reload
+; AVX512BW-NEXT:    # ymm5 = ymm22[1],mem[1],ymm22[3],mem[3]
+; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm5 = ymm5[2,3],ymm7[2,3]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm5, %zmm10, %zmm5
+; AVX512BW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm5
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm22, %zmm2, %zmm5
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512BW-NEXT:    vpermi2q %zmm9, %zmm3, %zmm2
+; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm2 {%k1}
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm8[1],ymm24[1],ymm8[3],ymm24[3]
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm0
+; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12]
+; AVX512BW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm17, %zmm1
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm10, %zmm0, %zmm1
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm2 = zmm24[0],zmm4[0],zmm24[2],zmm4[2],zmm24[4],zmm4[4],zmm24[6],zmm4[6]
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 3136(%rdi), %zmm4
+; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 3072(%rdi), %zmm3
+; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm0, %zmm3
+; AVX512BW-NEXT:    vmovdqa64 3264(%rdi), %zmm5
+; AVX512BW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 3200(%rdi), %zmm4
+; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [4,12,4,12]
+; AVX512BW-NEXT:    # ymm1 = mem[0,1,0,1]
+; AVX512BW-NEXT:    vpermt2q %zmm5, %zmm1, %zmm4
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm3, %zmm2, %zmm2
+; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm30 # 64-byte Folded Reload
+; AVX512BW-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm31, %zmm3 # 64-byte Folded Reload
+; AVX512BW-NEXT:    # zmm3 = zmm31[0],mem[0],zmm31[2],mem[2],zmm31[4],mem[4],zmm31[6],mem[6]
+; AVX512BW-NEXT:    vmovdqa64 %zmm30, %zmm3 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 576(%rdi), %zmm4
+; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 512(%rdi), %zmm2
+; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm0, %zmm2
+; AVX512BW-NEXT:    vmovdqa64 704(%rdi), %zmm5
+; AVX512BW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 640(%rdi), %zmm4
+; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm5, %zmm1, %zmm4
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm30, %zmm0, %zmm2
+; AVX512BW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm6[0],zmm26[0],zmm6[2],zmm26[2],zmm6[4],zmm26[4],zmm6[6],zmm26[6]
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %zmm4
+; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm2
+; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm0, %zmm2
+; AVX512BW-NEXT:    vmovdqa64 192(%rdi), %zmm26
+; AVX512BW-NEXT:    vmovdqa64 128(%rdi), %zmm4
+; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm26, %zmm1, %zmm4
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm23, %zmm2
+; AVX512BW-NEXT:    vpermt2q %zmm19, %zmm0, %zmm2
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm12[0],zmm18[0],zmm12[2],zmm18[2],zmm12[4],zmm18[4],zmm12[6],zmm18[6]
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 1600(%rdi), %zmm4
+; AVX512BW-NEXT:    vmovdqa64 1536(%rdi), %zmm2
+; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm0, %zmm2
+; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm19
+; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 1728(%rdi), %zmm23
+; AVX512BW-NEXT:    vmovdqa64 1664(%rdi), %zmm4
+; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm23, %zmm1, %zmm4
+; AVX512BW-NEXT:    vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm21, %zmm2
+; AVX512BW-NEXT:    vpermt2q %zmm25, %zmm0, %zmm2
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm16[0],zmm20[0],zmm16[2],zmm20[2],zmm16[4],zmm20[4],zmm16[6],zmm20[6]
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 1088(%rdi), %zmm4
+; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 1024(%rdi), %zmm2
+; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm0, %zmm2
+; AVX512BW-NEXT:    vmovdqa64 1216(%rdi), %zmm6
+; AVX512BW-NEXT:    vmovdqa64 1152(%rdi), %zmm4
+; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm6, %zmm1, %zmm4
+; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm21
+; AVX512BW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm29, %zmm2
+; AVX512BW-NEXT:    vpermt2q %zmm28, %zmm0, %zmm2
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm8[0],zmm27[0],zmm8[2],zmm27[2],zmm8[4],zmm27[4],zmm8[6],zmm27[6]
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 2624(%rdi), %zmm29
+; AVX512BW-NEXT:    vmovdqa64 2560(%rdi), %zmm2
+; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm29, %zmm0, %zmm2
+; AVX512BW-NEXT:    vmovdqa64 2752(%rdi), %zmm28
+; AVX512BW-NEXT:    vmovdqa64 2688(%rdi), %zmm4
+; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm28, %zmm1, %zmm4
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 2112(%rdi), %zmm3
+; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 2048(%rdi), %zmm2
+; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm0, %zmm2
+; AVX512BW-NEXT:    vmovdqa64 2240(%rdi), %zmm4
+; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 2176(%rdi), %zmm3
+; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm1, %zmm3
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqa64 %zmm14, %zmm3
+; AVX512BW-NEXT:    vpermt2q %zmm15, %zmm0, %zmm3
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm4 = zmm13[0],zmm11[0],zmm13[2],zmm11[2],zmm13[4],zmm11[4],zmm13[6],zmm11[6]
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm2
+; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm6
+; AVX512BW-NEXT:    vpermt2q %zmm22, %zmm0, %zmm2
+; AVX512BW-NEXT:    vmovdqa64 %zmm22, %zmm5
+; AVX512BW-NEXT:    vmovdqa64 3648(%rdi), %zmm4
+; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 3584(%rdi), %zmm3
+; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermi2q %zmm4, %zmm3, %zmm0
+; AVX512BW-NEXT:    vmovdqa64 3776(%rdi), %zmm4
+; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 3712(%rdi), %zmm3
+; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermi2q %zmm4, %zmm3, %zmm1
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm1 = zmm1[0],zmm9[0],zmm1[2],zmm9[2],zmm1[4],zmm9[4],zmm1[6],zmm9[6]
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm1 {%k1}
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [5,13,5,13,5,13,5,13]
+; AVX512BW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
+; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [5,13,5,13]
+; AVX512BW-NEXT:    # ymm1 = mem[0,1,0,1]
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm31, %zmm1, %zmm3
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqa64 %zmm17, %zmm3
+; AVX512BW-NEXT:    vpermt2q %zmm10, %zmm0, %zmm3
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm4 = zmm24[1],zmm17[1],zmm24[3],zmm17[3],zmm24[5],zmm17[5],zmm24[7],zmm17[7]
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm15, %zmm0, %zmm2
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm16, %zmm1, %zmm3
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm27, %zmm3
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm18, %zmm0, %zmm3
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm4 = zmm25[1],zmm22[1],zmm25[3],zmm22[3],zmm25[5],zmm22[5],zmm25[7],zmm22[7]
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm13, %zmm0, %zmm2
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm26, %zmm1, %zmm3
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm20, %zmm3
+; AVX512BW-NEXT:    vpermt2q %zmm30, %zmm0, %zmm3
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512BW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm4 # 64-byte Folded Reload
+; AVX512BW-NEXT:    # zmm4 = zmm12[1],mem[1],zmm12[3],mem[3],zmm12[5],mem[5],zmm12[7],mem[7]
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm19, %zmm0, %zmm2
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm23, %zmm1, %zmm3
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
+; AVX512BW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm19, %zmm4 # 64-byte Folded Reload
+; AVX512BW-NEXT:    # zmm4 = zmm19[1],mem[1],zmm19[3],mem[3],zmm19[5],mem[5],zmm19[7],mem[7]
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm23, %zmm0, %zmm2
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm21, %zmm1, %zmm3
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
+; AVX512BW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm24, %zmm4 # 64-byte Folded Reload
+; AVX512BW-NEXT:    # zmm4 = zmm24[1],mem[1],zmm24[3],mem[3],zmm24[5],mem[5],zmm24[7],mem[7]
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm29, %zmm0, %zmm2
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm28, %zmm1, %zmm3
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
+; AVX512BW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm4 # 64-byte Folded Reload
+; AVX512BW-NEXT:    # zmm4 = zmm8[1],mem[1],zmm8[3],mem[3],zmm8[5],mem[5],zmm8[7],mem[7]
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm9, %zmm0, %zmm2
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm10, %zmm3
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm11, %zmm1, %zmm3
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqa64 %zmm14, %zmm3
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm21, %zmm0, %zmm3
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqu64 (%rsp), %zmm8 # 64-byte Reload
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm4 = zmm8[1],zmm7[1],zmm8[3],zmm7[3],zmm8[5],zmm7[5],zmm8[7],zmm7[7]
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm2
+; AVX512BW-NEXT:    vpermt2q %zmm5, %zmm0, %zmm2
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512BW-NEXT:    vpermi2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm0 # 64-byte Folded Reload
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-NEXT:    vpermi2q %zmm4, %zmm5, %zmm1
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 64-byte Folded Reload
+; AVX512BW-NEXT:    # zmm1 = zmm1[1],mem[1],zmm1[3],mem[3],zmm1[5],mem[5],zmm1[7],mem[7]
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm1 {%k1}
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [6,14,6,14,6,14,6,14]
+; AVX512BW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
+; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14]
+; AVX512BW-NEXT:    # ymm1 = mem[0,1,0,1]
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm31, %zmm1, %zmm3
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm17, %zmm0, %zmm3
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
+; AVX512BW-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm3 {%k1} # 64-byte Folded Reload
+; AVX512BW-NEXT:    # zmm3 {%k1} = zmm17[0],mem[0],zmm17[2],mem[2],zmm17[4],mem[4],zmm17[6],mem[6]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm15, %zmm0, %zmm2
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm16, %zmm1, %zmm3
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqa64 %zmm25, %zmm3
+; AVX512BW-NEXT:    vpermt2q %zmm22, %zmm0, %zmm3
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm27[0],zmm18[0],zmm27[2],zmm18[2],zmm27[4],zmm18[4],zmm27[6],zmm18[6]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm13, %zmm0, %zmm2
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm26, %zmm1, %zmm3
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqa64 %zmm12, %zmm3
+; AVX512BW-NEXT:    vmovdqa64 %zmm12, %zmm26
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm13, %zmm0, %zmm3
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm20[0],zmm30[0],zmm20[2],zmm30[2],zmm20[4],zmm30[4],zmm20[6],zmm30[6]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm3 # 64-byte Folded Reload
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqa64 %zmm19, %zmm3
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm16, %zmm0, %zmm3
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm27[0],zmm31[0],zmm27[2],zmm31[2],zmm27[4],zmm31[4],zmm27[6],zmm31[6]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm23, %zmm0, %zmm2
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm3 # 64-byte Folded Reload
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqa64 %zmm24, %zmm3
+; AVX512BW-NEXT:    vmovdqa64 %zmm24, %zmm17
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm15, %zmm0, %zmm3
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm23[0],zmm22[0],zmm23[2],zmm22[2],zmm23[4],zmm22[4],zmm23[6],zmm22[6]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm29, %zmm0, %zmm2
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm28, %zmm1, %zmm3
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm12, %zmm3
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm18, %zmm0, %zmm3
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm29[0],zmm28[0],zmm29[2],zmm28[2],zmm29[4],zmm28[4],zmm29[6],zmm28[6]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm9, %zmm0, %zmm2
+; AVX512BW-NEXT:    vpermt2q %zmm11, %zmm1, %zmm10
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm3
+; AVX512BW-NEXT:    vpermt2q %zmm7, %zmm0, %zmm3
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm14[0],zmm21[0],zmm14[2],zmm21[2],zmm14[4],zmm21[4],zmm14[6],zmm21[6]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm2
+; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
+; AVX512BW-NEXT:    vpermi2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm0 # 64-byte Folded Reload
+; AVX512BW-NEXT:    vpermi2q %zmm4, %zmm5, %zmm1
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm6[0],zmm11[0],zmm6[2],zmm11[2],zmm6[4],zmm11[4],zmm6[6],zmm11[6]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm0
+; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm25, %zmm4
+; AVX512BW-NEXT:    vmovdqa64 %zmm25, %zmm24
+; AVX512BW-NEXT:    vmovdqa64 %zmm25, %zmm0
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm5 = [7,15,7,15,7,15,7,15]
+; AVX512BW-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm5, %zmm4
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm4 {%k1} = zmm25[1],zmm2[1],zmm25[3],zmm2[3],zmm25[5],zmm2[5],zmm25[7],zmm2[7]
+; AVX512BW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm25, %zmm10
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [0,8,0,8,0,8,0,8]
+; AVX512BW-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm1, %zmm25
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm8 = [1,9,1,9,1,9,1,9]
+; AVX512BW-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm8, %zmm10
+; AVX512BW-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm1, %zmm24
+; AVX512BW-NEXT:    vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm8, %zmm0
+; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm26, %zmm2
+; AVX512BW-NEXT:    vmovdqa64 %zmm26, %zmm24
+; AVX512BW-NEXT:    vmovdqa64 %zmm26, %zmm0
+; AVX512BW-NEXT:    vpermt2q %zmm13, %zmm5, %zmm2
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm20[1],zmm30[1],zmm20[3],zmm30[3],zmm20[5],zmm30[5],zmm20[7],zmm30[7]
+; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm20, %zmm2
+; AVX512BW-NEXT:    vpermt2q %zmm30, %zmm1, %zmm20
+; AVX512BW-NEXT:    vpermt2q %zmm30, %zmm8, %zmm2
+; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm13, %zmm1, %zmm24
+; AVX512BW-NEXT:    vpermt2q %zmm13, %zmm8, %zmm0
+; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm19, %zmm2
+; AVX512BW-NEXT:    vmovdqa64 %zmm19, %zmm13
+; AVX512BW-NEXT:    vmovdqa64 %zmm19, %zmm0
+; AVX512BW-NEXT:    vpermt2q %zmm16, %zmm5, %zmm2
+; AVX512BW-NEXT:    vmovdqa64 %zmm27, %zmm19
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm27[1],zmm31[1],zmm27[3],zmm31[3],zmm27[5],zmm31[5],zmm27[7],zmm31[7]
+; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm27, %zmm2
+; AVX512BW-NEXT:    vpermt2q %zmm31, %zmm1, %zmm19
+; AVX512BW-NEXT:    vpermt2q %zmm31, %zmm8, %zmm2
+; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm16, %zmm1, %zmm13
+; AVX512BW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm16, %zmm8, %zmm0
+; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm17, %zmm3
+; AVX512BW-NEXT:    vmovdqa64 %zmm17, %zmm2
+; AVX512BW-NEXT:    vpermt2q %zmm15, %zmm5, %zmm3
+; AVX512BW-NEXT:    vmovdqa64 %zmm23, %zmm16
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm23[1],zmm22[1],zmm23[3],zmm22[3],zmm23[5],zmm22[5],zmm23[7],zmm22[7]
+; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm23, %zmm3
+; AVX512BW-NEXT:    vpermt2q %zmm22, %zmm1, %zmm16
+; AVX512BW-NEXT:    vpermt2q %zmm22, %zmm8, %zmm3
+; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm15, %zmm1, %zmm17
+; AVX512BW-NEXT:    vpermt2q %zmm15, %zmm8, %zmm2
+; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm12, %zmm26
+; AVX512BW-NEXT:    vmovdqa64 %zmm12, %zmm31
+; AVX512BW-NEXT:    vpermt2q %zmm18, %zmm5, %zmm12
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm12 {%k1} = zmm29[1],zmm28[1],zmm29[3],zmm28[3],zmm29[5],zmm28[5],zmm29[7],zmm28[7]
+; AVX512BW-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm29, %zmm2
+; AVX512BW-NEXT:    vpermt2q %zmm28, %zmm1, %zmm29
+; AVX512BW-NEXT:    vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm28, %zmm8, %zmm2
+; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm18, %zmm1, %zmm26
+; AVX512BW-NEXT:    vpermt2q %zmm18, %zmm8, %zmm31
+; AVX512BW-NEXT:    vmovdqu64 (%rsp), %zmm2 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm28
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm23
+; AVX512BW-NEXT:    vpermt2q %zmm7, %zmm5, %zmm2
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm14[1],zmm21[1],zmm14[3],zmm21[3],zmm14[5],zmm21[5],zmm14[7],zmm21[7]
+; AVX512BW-NEXT:    vmovdqu64 %zmm2, (%rsp) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm14, %zmm2
+; AVX512BW-NEXT:    vmovdqa64 %zmm14, %zmm29
+; AVX512BW-NEXT:    vpermt2q %zmm21, %zmm1, %zmm29
+; AVX512BW-NEXT:    vpermt2q %zmm21, %zmm8, %zmm2
+; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermt2q %zmm7, %zmm1, %zmm28
+; AVX512BW-NEXT:    vpermt2q %zmm7, %zmm8, %zmm23
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm14, %zmm0
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm1, %zmm0
+; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm27, %zmm21
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q %zmm30, %zmm1, %zmm21
+; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm18
+; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm0
+; AVX512BW-NEXT:    vpermt2q %zmm11, %zmm1, %zmm0
+; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512BW-NEXT:    vpermi2q %zmm6, %zmm9, %zmm1
+; AVX512BW-NEXT:    vmovdqa64 %zmm14, %zmm7
+; AVX512BW-NEXT:    vmovdqa64 %zmm14, %zmm2
+; AVX512BW-NEXT:    vpermt2q %zmm4, %zmm8, %zmm7
+; AVX512BW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm27, %zmm22
+; AVX512BW-NEXT:    vpermt2q %zmm30, %zmm8, %zmm22
+; AVX512BW-NEXT:    vmovdqa64 %zmm18, %zmm3
+; AVX512BW-NEXT:    vpermt2q %zmm11, %zmm8, %zmm3
+; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpermi2q %zmm6, %zmm9, %zmm8
+; AVX512BW-NEXT:    vpermt2q %zmm6, %zmm5, %zmm9
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm9 {%k1} = zmm18[1],zmm11[1],zmm18[3],zmm11[3],zmm18[5],zmm11[5],zmm18[7],zmm11[7]
+; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm18
+; AVX512BW-NEXT:    vpermt2q %zmm30, %zmm5, %zmm27
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm2[1],zmm4[1],zmm2[3],zmm4[3],zmm2[5],zmm4[5],zmm2[7],zmm4[7]
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm13 # 64-byte Folded Reload
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm9 # 64-byte Folded Reload
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm10 # 64-byte Folded Reload
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm11 # 64-byte Folded Reload
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm12 # 64-byte Folded Reload
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm14 # 64-byte Folded Reload
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm0 # 64-byte Folded Reload
+; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm15 # 64-byte Folded Reload
+; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [7,15,7,15]
+; AVX512BW-NEXT:    # ymm0 = mem[0,1,0,1]
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm6 # 64-byte Folded Reload
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm7 # 64-byte Folded Reload
+; AVX512BW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm7 # 64-byte Folded Reload
+; AVX512BW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512BW-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm7 # 64-byte Folded Reload
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm2[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm0
+; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm3[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm0
+; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm4[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm0
+; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm5[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm0
+; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm6[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm0
+; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm0 # 32-byte Folded Reload
+; AVX512BW-NEXT:    # ymm0 = ymm14[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-NEXT:    vmovdqu64 (%rsp), %zmm2 # 64-byte Reload
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm14
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX512BW-NEXT:    # ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm18, %zmm18
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm7[4,5,6,7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm27, %zmm15
+; AVX512BW-NEXT:    vmovdqa64 %zmm20, %zmm24 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %xmm20
+; AVX512BW-NEXT:    vinserti32x4 $1, 192(%rdi), %ymm20, %ymm20
+; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512BW-NEXT:    vinserti128 $1, 128(%rdi), %ymm0, %ymm0
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm30 = ymm0[0],ymm20[0],ymm0[2],ymm20[2]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm30, %zmm24, %zmm2
+; AVX512BW-NEXT:    vmovdqu64 %zmm2, (%rsp) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm25, %zmm2 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 576(%rdi), %xmm25
+; AVX512BW-NEXT:    vinserti32x4 $1, 704(%rdi), %ymm25, %ymm25
+; AVX512BW-NEXT:    vmovdqa64 512(%rdi), %xmm30
+; AVX512BW-NEXT:    vinserti32x4 $1, 640(%rdi), %ymm30, %ymm30
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm27 = ymm30[0],ymm25[0],ymm30[2],ymm25[2]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm27, %zmm2, %zmm2
+; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm16, %zmm17 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 1088(%rdi), %xmm16
+; AVX512BW-NEXT:    vinserti32x4 $1, 1216(%rdi), %ymm16, %ymm16
+; AVX512BW-NEXT:    vmovdqa64 1024(%rdi), %xmm27
+; AVX512BW-NEXT:    vinserti32x4 $1, 1152(%rdi), %ymm27, %ymm27
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm6 = ymm27[0],ymm16[0],ymm27[2],ymm16[2]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm6, %zmm17, %zmm2
+; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm19, %zmm2 {%k1}
+; AVX512BW-NEXT:    vmovdqa 1600(%rdi), %xmm9
+; AVX512BW-NEXT:    vinserti128 $1, 1728(%rdi), %ymm9, %ymm9
+; AVX512BW-NEXT:    vmovdqa64 1536(%rdi), %xmm19
+; AVX512BW-NEXT:    vinserti32x4 $1, 1664(%rdi), %ymm19, %ymm19
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm24 = ymm19[0],ymm9[0],ymm19[2],ymm9[2]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm24, %zmm2, %zmm2
+; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vmovdqa64 %zmm29, %zmm28 {%k1}
+; AVX512BW-NEXT:    vmovdqa 2112(%rdi), %xmm5
+; AVX512BW-NEXT:    vinserti128 $1, 2240(%rdi), %ymm5, %ymm5
+; AVX512BW-NEXT:    vmovdqa64 2048(%rdi), %xmm24
+; AVX512BW-NEXT:    vinserti32x4 $1, 2176(%rdi), %ymm24, %ymm24
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm12 = ymm24[0],ymm5[0],ymm24[2],ymm5[2]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm12, %zmm28, %zmm12
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm26 {%k1}
+; AVX512BW-NEXT:    vmovdqa 2624(%rdi), %xmm13
+; AVX512BW-NEXT:    vinserti128 $1, 2752(%rdi), %ymm13, %ymm13
+; AVX512BW-NEXT:    vmovdqa64 2560(%rdi), %xmm28
+; AVX512BW-NEXT:    vinserti32x4 $1, 2688(%rdi), %ymm28, %ymm28
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm6 = ymm28[0],ymm13[0],ymm28[2],ymm13[2]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm6, %zmm26, %zmm6
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm21 {%k1}
+; AVX512BW-NEXT:    vmovdqa 3136(%rdi), %xmm7
+; AVX512BW-NEXT:    vinserti128 $1, 3264(%rdi), %ymm7, %ymm7
+; AVX512BW-NEXT:    vmovdqa64 3072(%rdi), %xmm26
+; AVX512BW-NEXT:    vinserti32x4 $1, 3200(%rdi), %ymm26, %ymm26
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm17 = ymm26[0],ymm7[0],ymm26[2],ymm7[2]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm17, %zmm21, %zmm3
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm1 {%k1}
+; AVX512BW-NEXT:    vmovdqa 3648(%rdi), %xmm10
+; AVX512BW-NEXT:    vinserti128 $1, 3776(%rdi), %ymm10, %ymm10
+; AVX512BW-NEXT:    vmovdqa64 3584(%rdi), %xmm17
+; AVX512BW-NEXT:    vinserti32x4 $1, 3712(%rdi), %ymm17, %ymm17
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm29 = ymm17[0],ymm10[0],ymm17[2],ymm10[2]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm29, %zmm1, %zmm1
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm22 {%k1}
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm7 = ymm26[1],ymm7[1],ymm26[3],ymm7[3]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm7, %zmm22, %zmm4
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm2 {%k1}
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm7 = ymm30[1],ymm25[1],ymm30[3],ymm25[3]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm7, %zmm2, %zmm7
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm2 {%k1}
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm20[1],ymm0[3],ymm20[3]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm0
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm2 {%k1}
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm9 = ymm19[1],ymm9[1],ymm19[3],ymm9[3]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm9, %zmm2, %zmm9
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm2 {%k1}
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm11 = ymm27[1],ymm16[1],ymm27[3],ymm16[3]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm11, %zmm2, %zmm11
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm31 {%k1}
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm13 = ymm28[1],ymm13[1],ymm28[3],ymm13[3]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm13, %zmm31, %zmm13
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm23 {%k1}
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm5 = ymm24[1],ymm5[1],ymm24[3],ymm5[3]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm5, %zmm23, %zmm5
+; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm8 {%k1}
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm10 = ymm17[1],ymm10[1],ymm17[3],ymm10[3]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm10, %zmm8, %zmm2
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, 448(%rsi)
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, 384(%rsi)
+; AVX512BW-NEXT:    vmovdqa64 %zmm6, 320(%rsi)
+; AVX512BW-NEXT:    vmovdqa64 %zmm12, 256(%rsi)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm1, 192(%rsi)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm1, 128(%rsi)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm1, 64(%rsi)
+; AVX512BW-NEXT:    vmovups (%rsp), %zmm1 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm1, (%rsi)
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, 448(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm5, 256(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm13, 320(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm11, 128(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm9, 192(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm7, 64(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm4, 384(%rdx)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm0, 448(%rcx)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm0, 256(%rcx)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm0, 320(%rcx)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm0, 128(%rcx)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm0, 192(%rcx)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm0, (%rcx)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm0, 64(%rcx)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm0, 384(%rcx)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm0, 448(%r8)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm0, 256(%r8)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm0, 320(%r8)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm0, 128(%r8)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm0, 192(%r8)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm0, (%r8)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm0, 64(%r8)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm0, 384(%r8)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm0, 448(%r9)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm0, 256(%r9)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm0, 320(%r9)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm0, 128(%r9)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm0, 192(%r9)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm0, (%r9)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm0, 64(%r9)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm0, 384(%r9)
+; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm0, 448(%rax)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm0, 256(%rax)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm0, 320(%rax)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm0, 128(%rax)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm0, 192(%rax)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm0, (%rax)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm0, 64(%rax)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm0, 384(%rax)
+; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm0, 448(%rax)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm0, 256(%rax)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm0, 320(%rax)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm0, 128(%rax)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm0, 192(%rax)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm0, (%rax)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm0, 64(%rax)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm0, 384(%rax)
+; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512BW-NEXT:    vmovdqa64 %zmm15, 384(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm18, 448(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm14, 256(%rax)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm0, 320(%rax)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm0, 128(%rax)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm0, 192(%rax)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm0, (%rax)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm0, 64(%rax)
+; AVX512BW-NEXT:    addq $6600, %rsp # imm = 0x19C8
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
   %wide.vec = load <512 x i64>, ptr %in.vec, align 64
   %strided.vec0 = shufflevector <512 x i64> %wide.vec, <512 x i64> poison, <64 x i32> <i32 0, i32 8, i32 16, i32 24, i32 32, i32 40, i32 48, i32 56, i32 64, i32 72, i32 80, i32 88, i32 96, i32 104, i32 112, i32 120, i32 128, i32 136, i32 144, i32 152, i32 160, i32 168, i32 176, i32 184, i32 192, i32 200, i32 208, i32 216, i32 224, i32 232, i32 240, i32 248, i32 256, i32 264, i32 272, i32 280, i32 288, i32 296, i32 304, i32 312, i32 320, i32 328, i32 336, i32 344, i32 352, i32 360, i32 368, i32 376, i32 384, i32 392, i32 400, i32 408, i32 416, i32 424, i32 432, i32 440, i32 448, i32 456, i32 464, i32 472, i32 480, i32 488, i32 496, i32 504>
   %strided.vec1 = shufflevector <512 x i64> %wide.vec, <512 x i64> poison, <64 x i32> <i32 1, i32 9, i32 17, i32 25, i32 33, i32 41, i32 49, i32 57, i32 65, i32 73, i32 81, i32 89, i32 97, i32 105, i32 113, i32 121, i32 129, i32 137, i32 145, i32 153, i32 161, i32 169, i32 177, i32 185, i32 193, i32 201, i32 209, i32 217, i32 225, i32 233, i32 241, i32 249, i32 257, i32 265, i32 273, i32 281, i32 289, i32 297, i32 305, i32 313, i32 321, i32 329, i32 337, i32 345, i32 353, i32 361, i32 369, i32 377, i32 385, i32 393, i32 401, i32 409, i32 417, i32 425, i32 433, i32 441, i32 449, i32 457, i32 465, i32 473, i32 481, i32 489, i32 497, i32 505>
@@ -18778,8 +11279,16 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FAST-PERLANE: {{.*}}
 ; AVX2-SLOW: {{.*}}
 ; AVX512BW-FAST: {{.*}}
+; AVX512BW-ONLY-FAST: {{.*}}
+; AVX512BW-ONLY-SLOW: {{.*}}
 ; AVX512BW-SLOW: {{.*}}
+; AVX512DQ-FAST: {{.*}}
+; AVX512DQ-SLOW: {{.*}}
+; AVX512DQBW-FAST: {{.*}}
+; AVX512DQBW-SLOW: {{.*}}
 ; AVX512F-FAST: {{.*}}
+; AVX512F-ONLY-FAST: {{.*}}
+; AVX512F-ONLY-SLOW: {{.*}}
 ; AVX512F-SLOW: {{.*}}
 ; FALLBACK0: {{.*}}
 ; FALLBACK1: {{.*}}

diff  --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll
index f9d8d0b5461bb0..982c07752b6b55 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll
@@ -82,14 +82,14 @@ define void @store_i16_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-ONLY-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX2-ONLY-NEXT:    movq {{[0-9]+}}(%rsp), %r10
 ; AVX2-ONLY-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vmovdqa (%rdx), %xmm1
-; AVX2-ONLY-NEXT:    vmovdqa (%r8), %xmm2
-; AVX2-ONLY-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; AVX2-ONLY-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
-; AVX2-ONLY-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX2-ONLY-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1]
-; AVX2-ONLY-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0]
-; AVX2-ONLY-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-ONLY-NEXT:    vmovdqa (%rsi), %xmm1
+; AVX2-ONLY-NEXT:    vmovdqa (%rdx), %xmm2
+; AVX2-ONLY-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
+; AVX2-ONLY-NEXT:    vinserti128 $1, (%r9), %ymm1, %ymm1
+; AVX2-ONLY-NEXT:    vinserti128 $1, (%r8), %ymm0, %ymm0
+; AVX2-ONLY-NEXT:    vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
+; AVX2-ONLY-NEXT:    vinserti128 $1, (%r10), %ymm2, %ymm1
+; AVX2-ONLY-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
 ; AVX2-ONLY-NEXT:    vpshufb {{.*#+}} ymm1 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,ymm0[2,3],zero,zero,zero,zero,zero,zero,ymm0[18,19,22,23,26,27],zero,zero,zero,zero
 ; AVX2-ONLY-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
 ; AVX2-ONLY-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,1,4,5,8,9],zero,zero,ymm0[22,23,26,27,30,31],zero,zero,zero,zero,zero,zero,ymm0[24,25,20,21]
@@ -107,14 +107,14 @@ define void @store_i16_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %r10
 ; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512F-NEXT:    vmovdqa (%rdx), %xmm1
-; AVX512F-NEXT:    vmovdqa (%r8), %xmm2
-; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1]
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0]
-; AVX512F-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT:    vmovdqa (%rsi), %xmm1
+; AVX512F-NEXT:    vmovdqa (%rdx), %xmm2
+; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
+; AVX512F-NEXT:    vinserti128 $1, (%r9), %ymm1, %ymm1
+; AVX512F-NEXT:    vinserti128 $1, (%r8), %ymm0, %ymm0
+; AVX512F-NEXT:    vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
+; AVX512F-NEXT:    vinserti128 $1, (%r10), %ymm2, %ymm1
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
 ; AVX512F-NEXT:    vpshufb {{.*#+}} ymm1 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,ymm0[2,3],zero,zero,zero,zero,zero,zero,ymm0[18,19,22,23,26,27,u,u,u,u]
 ; AVX512F-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
 ; AVX512F-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,1,4,5,8,9],zero,zero,ymm0[22,23,26,27,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -132,20 +132,19 @@ define void @store_i16_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %r10
 ; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512BW-NEXT:    vmovdqa (%rdx), %xmm1
-; AVX512BW-NEXT:    vmovdqa (%r8), %xmm2
-; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1]
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0]
-; AVX512BW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm1 = <0,2,4,6,8,10,12,1,3,5,7,9,11,13,u,u>
-; AVX512BW-NEXT:    vpermw %ymm0, %ymm1, %ymm0
-; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512BW-NEXT:    vpextrd $2, %xmm1, 24(%rax)
-; AVX512BW-NEXT:    vmovq %xmm1, 16(%rax)
-; AVX512BW-NEXT:    vmovdqa %xmm0, (%rax)
+; AVX512BW-NEXT:    vmovdqa (%rsi), %xmm1
+; AVX512BW-NEXT:    vmovdqa (%rdx), %xmm2
+; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
+; AVX512BW-NEXT:    vinserti128 $1, (%r9), %ymm1, %ymm1
+; AVX512BW-NEXT:    vinserti128 $1, (%r8), %ymm0, %ymm0
+; AVX512BW-NEXT:    vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
+; AVX512BW-NEXT:    vinserti128 $1, (%r10), %ymm2, %ymm1
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm2 = <0,2,16,18,8,10,24,1,3,17,19,9,11,25,u,u>
+; AVX512BW-NEXT:    vpermi2w %ymm1, %ymm0, %ymm2
+; AVX512BW-NEXT:    vextracti128 $1, %ymm2, %xmm0
+; AVX512BW-NEXT:    vpextrd $2, %xmm0, 24(%rax)
+; AVX512BW-NEXT:    vmovq %xmm0, 16(%rax)
+; AVX512BW-NEXT:    vmovdqa %xmm2, (%rax)
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
   %in.vec0 = load <2 x i16>, ptr %in.vecptr0, align 64

diff  --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-8.ll
index 79a9f6ab96ea18..36c79fa995dcc7 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-8.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-8.ll
@@ -77,16 +77,16 @@ define void @store_i16_stride8_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-ONLY-NEXT:    movq {{[0-9]+}}(%rsp), %r10
 ; AVX2-ONLY-NEXT:    movq {{[0-9]+}}(%rsp), %r11
 ; AVX2-ONLY-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vmovdqa (%rdx), %xmm1
-; AVX2-ONLY-NEXT:    vmovdqa (%r8), %xmm2
-; AVX2-ONLY-NEXT:    vmovdqa (%r11), %xmm3
-; AVX2-ONLY-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; AVX2-ONLY-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
-; AVX2-ONLY-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX2-ONLY-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1]
-; AVX2-ONLY-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm3[0],mem[0],xmm3[1],mem[1]
-; AVX2-ONLY-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX2-ONLY-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-ONLY-NEXT:    vmovdqa (%rsi), %xmm1
+; AVX2-ONLY-NEXT:    vmovdqa (%rdx), %xmm2
+; AVX2-ONLY-NEXT:    vmovdqa (%rcx), %xmm3
+; AVX2-ONLY-NEXT:    vinserti128 $1, (%r11), %ymm3, %ymm3
+; AVX2-ONLY-NEXT:    vinserti128 $1, (%r10), %ymm2, %ymm2
+; AVX2-ONLY-NEXT:    vpunpckldq {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5]
+; AVX2-ONLY-NEXT:    vinserti128 $1, (%r9), %ymm1, %ymm1
+; AVX2-ONLY-NEXT:    vinserti128 $1, (%r8), %ymm0, %ymm0
+; AVX2-ONLY-NEXT:    vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
+; AVX2-ONLY-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2]
 ; AVX2-ONLY-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,2,3,6,7,10,11,14,15,16,17,20,21,24,25,28,29,18,19,22,23,26,27,30,31]
 ; AVX2-ONLY-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
 ; AVX2-ONLY-NEXT:    vmovdqa %ymm0, (%rax)
@@ -99,16 +99,16 @@ define void @store_i16_stride8_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %r10
 ; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %r11
 ; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512F-NEXT:    vmovdqa (%rdx), %xmm1
-; AVX512F-NEXT:    vmovdqa (%r8), %xmm2
-; AVX512F-NEXT:    vmovdqa (%r11), %xmm3
-; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1]
-; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm3[0],mem[0],xmm3[1],mem[1]
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX512F-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT:    vmovdqa (%rsi), %xmm1
+; AVX512F-NEXT:    vmovdqa (%rdx), %xmm2
+; AVX512F-NEXT:    vmovdqa (%rcx), %xmm3
+; AVX512F-NEXT:    vinserti128 $1, (%r11), %ymm3, %ymm3
+; AVX512F-NEXT:    vinserti128 $1, (%r10), %ymm2, %ymm2
+; AVX512F-NEXT:    vpunpckldq {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5]
+; AVX512F-NEXT:    vinserti128 $1, (%r9), %ymm1, %ymm1
+; AVX512F-NEXT:    vinserti128 $1, (%r8), %ymm0, %ymm0
+; AVX512F-NEXT:    vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2]
 ; AVX512F-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,2,3,6,7,10,11,14,15,16,17,20,21,24,25,28,29,18,19,22,23,26,27,30,31]
 ; AVX512F-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
 ; AVX512F-NEXT:    vmovdqa %ymm0, (%rax)
@@ -121,19 +121,18 @@ define void @store_i16_stride8_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %r10
 ; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %r11
 ; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512BW-NEXT:    vmovdqa (%rdx), %xmm1
-; AVX512BW-NEXT:    vmovdqa (%r8), %xmm2
-; AVX512BW-NEXT:    vmovdqa (%r11), %xmm3
-; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1]
-; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm3[0],mem[0],xmm3[1],mem[1]
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX512BW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,2,4,6,8,10,12,14,1,3,5,7,9,11,13,15]
-; AVX512BW-NEXT:    vpermw %ymm0, %ymm1, %ymm0
-; AVX512BW-NEXT:    vmovdqa %ymm0, (%rax)
+; AVX512BW-NEXT:    vmovdqa (%rsi), %xmm1
+; AVX512BW-NEXT:    vmovdqa (%rdx), %xmm2
+; AVX512BW-NEXT:    vmovdqa (%rcx), %xmm3
+; AVX512BW-NEXT:    vinserti128 $1, (%r11), %ymm3, %ymm3
+; AVX512BW-NEXT:    vinserti128 $1, (%r10), %ymm2, %ymm2
+; AVX512BW-NEXT:    vpunpckldq {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5]
+; AVX512BW-NEXT:    vinserti128 $1, (%r9), %ymm1, %ymm1
+; AVX512BW-NEXT:    vinserti128 $1, (%r8), %ymm0, %ymm0
+; AVX512BW-NEXT:    vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,2,16,18,8,10,24,26,1,3,17,19,9,11,25,27]
+; AVX512BW-NEXT:    vpermi2w %ymm2, %ymm0, %ymm1
+; AVX512BW-NEXT:    vmovdqa %ymm1, (%rax)
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
   %in.vec0 = load <2 x i16>, ptr %in.vecptr0, align 64

diff  --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-4.ll
index 13142652bd4211..3c33ef722a2f7f 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-4.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-4.ll
@@ -309,54 +309,46 @@ define void @store_i64_stride4_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ;
 ; AVX2-ONLY-LABEL: store_i64_stride4_vf8:
 ; AVX2-ONLY:       # %bb.0:
-; AVX2-ONLY-NEXT:    vmovaps (%rdi), %ymm2
+; AVX2-ONLY-NEXT:    vmovaps (%rdi), %ymm0
 ; AVX2-ONLY-NEXT:    vmovaps 32(%rdi), %ymm1
-; AVX2-ONLY-NEXT:    vmovaps (%rsi), %ymm3
-; AVX2-ONLY-NEXT:    vmovaps 32(%rsi), %ymm4
-; AVX2-ONLY-NEXT:    vmovaps (%rdx), %ymm5
-; AVX2-ONLY-NEXT:    vmovaps 32(%rdx), %ymm6
-; AVX2-ONLY-NEXT:    vmovaps (%rcx), %ymm7
-; AVX2-ONLY-NEXT:    vmovaps 32(%rcx), %ymm8
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm6[0],ymm8[0],ymm6[2],ymm8[2]
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm9 = ymm1[0],ymm4[0],ymm1[2],ymm4[2]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm9[2,3],ymm0[2,3]
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm6 = ymm6[1],ymm8[1],ymm6[3],ymm8[3]
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm4[1],ymm1[3],ymm4[3]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm6[2,3]
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm4 = ymm5[0],ymm7[0],ymm5[2],ymm7[2]
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm6 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm6[2,3],ymm4[2,3]
+; AVX2-ONLY-NEXT:    vmovaps (%rsi), %ymm2
+; AVX2-ONLY-NEXT:    vmovaps 32(%rsi), %ymm3
+; AVX2-ONLY-NEXT:    vmovaps (%rdx), %ymm4
+; AVX2-ONLY-NEXT:    vmovaps 32(%rdx), %ymm5
+; AVX2-ONLY-NEXT:    vmovaps (%rcx), %ymm6
+; AVX2-ONLY-NEXT:    vmovaps 32(%rcx), %ymm7
+; AVX2-ONLY-NEXT:    vmovaps (%rsi), %xmm8
+; AVX2-ONLY-NEXT:    vmovaps 32(%rsi), %xmm9
+; AVX2-ONLY-NEXT:    vinsertf128 $1, (%rcx), %ymm8, %ymm8
+; AVX2-ONLY-NEXT:    vmovaps (%rdi), %xmm10
+; AVX2-ONLY-NEXT:    vmovaps 32(%rdi), %xmm11
+; AVX2-ONLY-NEXT:    vinsertf128 $1, (%rdx), %ymm10, %ymm10
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm12 = ymm10[1],ymm8[1],ymm10[3],ymm8[3]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm8 = ymm10[0],ymm8[0],ymm10[2],ymm8[2]
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 32(%rcx), %ymm9, %ymm9
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 32(%rdx), %ymm11, %ymm10
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm11 = ymm10[0],ymm9[0],ymm10[2],ymm9[2]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm9 = ymm10[1],ymm9[1],ymm10[3],ymm9[3]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm10 = ymm5[0],ymm7[0],ymm5[2],ymm7[2]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm13 = ymm1[0],ymm3[0],ymm1[2],ymm3[2]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm10 = ymm13[2,3],ymm10[2,3]
 ; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm5 = ymm5[1],ymm7[1],ymm5[3],ymm7[3]
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],ymm5[2,3]
-; AVX2-ONLY-NEXT:    vmovaps (%rsi), %xmm3
-; AVX2-ONLY-NEXT:    vmovaps 32(%rsi), %xmm5
-; AVX2-ONLY-NEXT:    vmovaps (%rdi), %xmm6
-; AVX2-ONLY-NEXT:    vmovaps 32(%rdi), %xmm7
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm8 = xmm7[0],xmm5[0]
-; AVX2-ONLY-NEXT:    vmovaps (%rcx), %xmm9
-; AVX2-ONLY-NEXT:    vmovaps 32(%rcx), %xmm10
-; AVX2-ONLY-NEXT:    vmovaps (%rdx), %xmm11
-; AVX2-ONLY-NEXT:    vmovaps 32(%rdx), %xmm12
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm13 = xmm12[0],xmm10[0]
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm5 = xmm7[1],xmm5[1]
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm7 = xmm12[1],xmm10[1]
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm10 = xmm6[0],xmm3[0]
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm12 = xmm11[0],xmm9[0]
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm3 = xmm6[1],xmm3[1]
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm6 = xmm11[1],xmm9[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm6, 48(%r8)
-; AVX2-ONLY-NEXT:    vmovaps %xmm3, 32(%r8)
-; AVX2-ONLY-NEXT:    vmovaps %xmm12, 16(%r8)
-; AVX2-ONLY-NEXT:    vmovaps %xmm10, (%r8)
-; AVX2-ONLY-NEXT:    vmovaps %xmm7, 176(%r8)
-; AVX2-ONLY-NEXT:    vmovaps %xmm5, 160(%r8)
-; AVX2-ONLY-NEXT:    vmovaps %xmm13, 144(%r8)
-; AVX2-ONLY-NEXT:    vmovaps %xmm8, 128(%r8)
-; AVX2-ONLY-NEXT:    vmovaps %ymm2, 96(%r8)
-; AVX2-ONLY-NEXT:    vmovaps %ymm4, 64(%r8)
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm3[1],ymm1[3],ymm3[3]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm5[2,3]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm3 = ymm4[0],ymm6[0],ymm4[2],ymm6[2]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm5 = ymm0[0],ymm2[0],ymm0[2],ymm2[2]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm5[2,3],ymm3[2,3]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm6[1],ymm4[3],ymm6[3]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm4[2,3]
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 96(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm3, 64(%r8)
 ; AVX2-ONLY-NEXT:    vmovaps %ymm1, 224(%r8)
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 192(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm10, 192(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm9, 160(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm11, 128(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm8, (%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm12, 32(%r8)
 ; AVX2-ONLY-NEXT:    vzeroupper
 ; AVX2-ONLY-NEXT:    retq
 ;
@@ -717,122 +709,96 @@ define void @store_i64_stride4_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ;
 ; AVX2-ONLY-LABEL: store_i64_stride4_vf16:
 ; AVX2-ONLY:       # %bb.0:
-; AVX2-ONLY-NEXT:    subq $152, %rsp
-; AVX2-ONLY-NEXT:    vmovaps 96(%rdi), %ymm7
-; AVX2-ONLY-NEXT:    vmovaps 64(%rdi), %ymm5
-; AVX2-ONLY-NEXT:    vmovaps 32(%rdi), %ymm3
-; AVX2-ONLY-NEXT:    vmovaps (%rdi), %ymm1
-; AVX2-ONLY-NEXT:    vmovaps 96(%rsi), %ymm8
-; AVX2-ONLY-NEXT:    vmovaps 64(%rsi), %ymm6
+; AVX2-ONLY-NEXT:    pushq %rax
+; AVX2-ONLY-NEXT:    vmovaps 32(%rdi), %ymm2
+; AVX2-ONLY-NEXT:    vmovaps (%rdi), %ymm6
 ; AVX2-ONLY-NEXT:    vmovaps 32(%rsi), %ymm4
-; AVX2-ONLY-NEXT:    vmovaps (%rsi), %ymm2
-; AVX2-ONLY-NEXT:    vmovaps 96(%rdx), %ymm9
-; AVX2-ONLY-NEXT:    vmovaps 64(%rdx), %ymm10
-; AVX2-ONLY-NEXT:    vmovaps 32(%rdx), %ymm11
-; AVX2-ONLY-NEXT:    vmovaps (%rdx), %ymm12
-; AVX2-ONLY-NEXT:    vmovaps 96(%rcx), %ymm13
-; AVX2-ONLY-NEXT:    vmovaps (%rcx), %ymm14
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm12[0],ymm14[0],ymm12[2],ymm14[2]
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm15 = ymm1[0],ymm2[0],ymm1[2],ymm2[2]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3]
-; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 64(%rcx), %ymm15
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm12 = ymm12[1],ymm14[1],ymm12[3],ymm14[3]
-; AVX2-ONLY-NEXT:    vmovaps 32(%rcx), %ymm14
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm12[2,3]
-; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm11[0],ymm14[0],ymm11[2],ymm14[2]
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm12 = ymm3[0],ymm4[0],ymm3[2],ymm4[2]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm12[2,3],ymm2[2,3]
-; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm11 = ymm11[1],ymm14[1],ymm11[3],ymm14[3]
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm4[1],ymm3[3],ymm4[3]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm3[2,3],ymm11[2,3]
-; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm4 = ymm10[0],ymm15[0],ymm10[2],ymm15[2]
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm11 = ymm5[0],ymm6[0],ymm5[2],ymm6[2]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm11[2,3],ymm4[2,3]
+; AVX2-ONLY-NEXT:    vmovaps (%rsi), %ymm7
+; AVX2-ONLY-NEXT:    vmovaps (%rdx), %ymm9
+; AVX2-ONLY-NEXT:    vmovaps (%rcx), %ymm11
+; AVX2-ONLY-NEXT:    vmovaps (%rsi), %xmm0
+; AVX2-ONLY-NEXT:    vmovaps 32(%rsi), %xmm3
+; AVX2-ONLY-NEXT:    vmovaps 64(%rsi), %xmm8
+; AVX2-ONLY-NEXT:    vinsertf128 $1, (%rcx), %ymm0, %ymm1
+; AVX2-ONLY-NEXT:    vmovaps (%rdi), %xmm0
+; AVX2-ONLY-NEXT:    vmovaps 32(%rdi), %xmm5
+; AVX2-ONLY-NEXT:    vmovaps 64(%rdi), %xmm10
+; AVX2-ONLY-NEXT:    vinsertf128 $1, (%rdx), %ymm0, %ymm12
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm12[0],ymm1[0],ymm12[2],ymm1[2]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm10 = ymm10[1],ymm15[1],ymm10[3],ymm15[3]
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm5 = ymm5[1],ymm6[1],ymm5[3],ymm6[3]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm5[2,3],ymm10[2,3]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm12[1],ymm1[1],ymm12[3],ymm1[3]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm6 = ymm9[0],ymm13[0],ymm9[2],ymm13[2]
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm10 = ymm7[0],ymm8[0],ymm7[2],ymm8[2]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm10[2,3],ymm6[2,3]
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 32(%rcx), %ymm3, %ymm12
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 32(%rdx), %ymm5, %ymm5
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm5[0],ymm12[0],ymm5[2],ymm12[2]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm9 = ymm9[1],ymm13[1],ymm9[3],ymm13[3]
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm7 = ymm7[1],ymm8[1],ymm7[3],ymm8[3]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm7[2,3],ymm9[2,3]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm5[1],ymm12[1],ymm5[3],ymm12[3]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 32(%rsi), %xmm9
-; AVX2-ONLY-NEXT:    vmovaps 64(%rsi), %xmm10
-; AVX2-ONLY-NEXT:    vmovaps 32(%rdi), %xmm11
-; AVX2-ONLY-NEXT:    vmovaps 64(%rdi), %xmm12
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm12[0],xmm10[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 32(%rcx), %xmm13
-; AVX2-ONLY-NEXT:    vmovaps 64(%rcx), %xmm14
-; AVX2-ONLY-NEXT:    vmovaps 64(%rdx), %xmm15
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm10 = xmm12[1],xmm10[1]
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm12 = xmm15[0],xmm14[0]
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm14 = xmm15[1],xmm14[1]
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm15 = xmm11[0],xmm9[0]
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm9 = xmm11[1],xmm9[1]
-; AVX2-ONLY-NEXT:    vmovaps 32(%rdx), %xmm11
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm8 = xmm11[0],xmm13[0]
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm11 = xmm11[1],xmm13[1]
-; AVX2-ONLY-NEXT:    vmovaps 96(%rsi), %xmm13
-; AVX2-ONLY-NEXT:    vmovaps 96(%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm7 = xmm0[0],xmm13[0]
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm6 = xmm0[1],xmm13[1]
-; AVX2-ONLY-NEXT:    vmovaps 96(%rcx), %xmm13
-; AVX2-ONLY-NEXT:    vmovaps 96(%rdx), %xmm0
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm5 = xmm0[0],xmm13[0]
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm4 = xmm0[1],xmm13[1]
-; AVX2-ONLY-NEXT:    vmovaps (%rsi), %xmm13
-; AVX2-ONLY-NEXT:    vmovaps (%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm3 = xmm0[0],xmm13[0]
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm2 = xmm0[1],xmm13[1]
-; AVX2-ONLY-NEXT:    vmovaps (%rcx), %xmm13
-; AVX2-ONLY-NEXT:    vmovaps (%rdx), %xmm0
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm0[0],xmm13[0]
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm13[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, 48(%r8)
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, 32(%r8)
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 16(%r8)
-; AVX2-ONLY-NEXT:    vmovaps %xmm3, (%r8)
-; AVX2-ONLY-NEXT:    vmovaps %xmm4, 432(%r8)
-; AVX2-ONLY-NEXT:    vmovaps %xmm6, 416(%r8)
-; AVX2-ONLY-NEXT:    vmovaps %xmm5, 400(%r8)
-; AVX2-ONLY-NEXT:    vmovaps %xmm7, 384(%r8)
-; AVX2-ONLY-NEXT:    vmovaps %xmm11, 176(%r8)
-; AVX2-ONLY-NEXT:    vmovaps %xmm9, 160(%r8)
-; AVX2-ONLY-NEXT:    vmovaps %xmm8, 144(%r8)
-; AVX2-ONLY-NEXT:    vmovaps %xmm15, 128(%r8)
-; AVX2-ONLY-NEXT:    vmovaps %xmm14, 304(%r8)
-; AVX2-ONLY-NEXT:    vmovaps %xmm10, 288(%r8)
-; AVX2-ONLY-NEXT:    vmovaps %xmm12, 272(%r8)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, 256(%r8)
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 480(%r8)
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 64(%rcx), %ymm8, %ymm12
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 64(%rdx), %ymm10, %ymm10
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm8 = ymm10[0],ymm12[0],ymm10[2],ymm12[2]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm10 = ymm10[1],ymm12[1],ymm10[3],ymm12[3]
+; AVX2-ONLY-NEXT:    vmovaps 96(%rsi), %xmm12
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 96(%rcx), %ymm12, %ymm13
+; AVX2-ONLY-NEXT:    vmovaps 96(%rdi), %xmm12
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 96(%rdx), %ymm12, %ymm14
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm12 = ymm14[0],ymm13[0],ymm14[2],ymm13[2]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm13 = ymm14[1],ymm13[1],ymm14[3],ymm13[3]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm14 = ymm9[0],ymm11[0],ymm9[2],ymm11[2]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm15 = ymm6[0],ymm7[0],ymm6[2],ymm7[2]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm14 = ymm15[2,3],ymm14[2,3]
+; AVX2-ONLY-NEXT:    vmovaps 32(%rdx), %ymm15
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm9 = ymm9[1],ymm11[1],ymm9[3],ymm11[3]
+; AVX2-ONLY-NEXT:    vmovaps 32(%rcx), %ymm11
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm6 = ymm6[1],ymm7[1],ymm6[3],ymm7[3]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm6 = ymm6[2,3],ymm9[2,3]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm7 = ymm15[0],ymm11[0],ymm15[2],ymm11[2]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm9 = ymm2[0],ymm4[0],ymm2[2],ymm4[2]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm7 = ymm9[2,3],ymm7[2,3]
+; AVX2-ONLY-NEXT:    vmovaps 64(%rdi), %ymm9
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm11 = ymm15[1],ymm11[1],ymm15[3],ymm11[3]
+; AVX2-ONLY-NEXT:    vmovaps 64(%rsi), %ymm15
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm4[1],ymm2[3],ymm4[3]
+; AVX2-ONLY-NEXT:    vmovaps 64(%rdx), %ymm4
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm2[2,3],ymm11[2,3]
+; AVX2-ONLY-NEXT:    vmovaps 64(%rcx), %ymm11
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm4[0],ymm11[0],ymm4[2],ymm11[2]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm9[0],ymm15[0],ymm9[2],ymm15[2]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm1[2,3],ymm0[2,3]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm1 = ymm4[1],ymm11[1],ymm4[3],ymm11[3]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm4 = ymm9[1],ymm15[1],ymm9[3],ymm15[3]
+; AVX2-ONLY-NEXT:    vmovaps 96(%rdi), %ymm9
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[2,3]
+; AVX2-ONLY-NEXT:    vmovaps 96(%rsi), %ymm4
+; AVX2-ONLY-NEXT:    vmovaps 96(%rdx), %ymm11
+; AVX2-ONLY-NEXT:    vmovaps 96(%rcx), %ymm15
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm11[0],ymm15[0],ymm11[2],ymm15[2]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm5 = ymm9[0],ymm4[0],ymm9[2],ymm4[2]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm5[2,3],ymm0[2,3]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm5 = ymm11[1],ymm15[1],ymm11[3],ymm15[3]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm4 = ymm9[1],ymm4[1],ymm9[3],ymm4[3]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],ymm5[2,3]
+; AVX2-ONLY-NEXT:    vmovaps %ymm4, 480(%r8)
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, 448(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, 352(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm2, 320(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm3, 224(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm7, 192(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm6, 96(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm14, 64(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm13, 416(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm12, 384(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm10, 288(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm8, 256(%r8)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 352(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 160(%r8)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 320(%r8)
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 224(%r8)
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 192(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 128(%r8)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 96(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 32(%r8)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 64(%r8)
-; AVX2-ONLY-NEXT:    addq $152, %rsp
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, (%r8)
+; AVX2-ONLY-NEXT:    popq %rax
 ; AVX2-ONLY-NEXT:    vzeroupper
 ; AVX2-ONLY-NEXT:    retq
 ;
@@ -1565,266 +1531,208 @@ define void @store_i64_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ;
 ; AVX2-ONLY-LABEL: store_i64_stride4_vf32:
 ; AVX2-ONLY:       # %bb.0:
-; AVX2-ONLY-NEXT:    subq $664, %rsp # imm = 0x298
-; AVX2-ONLY-NEXT:    vmovaps 160(%rdi), %ymm0
-; AVX2-ONLY-NEXT:    vmovaps 128(%rdi), %ymm1
-; AVX2-ONLY-NEXT:    vmovaps 96(%rdi), %ymm3
-; AVX2-ONLY-NEXT:    vmovaps 64(%rdi), %ymm5
-; AVX2-ONLY-NEXT:    vmovaps 32(%rdi), %ymm7
-; AVX2-ONLY-NEXT:    vmovaps (%rdi), %ymm9
-; AVX2-ONLY-NEXT:    vmovaps 160(%rsi), %ymm2
-; AVX2-ONLY-NEXT:    vmovaps 128(%rsi), %ymm4
-; AVX2-ONLY-NEXT:    vmovaps 96(%rsi), %ymm6
-; AVX2-ONLY-NEXT:    vmovaps 64(%rsi), %ymm8
-; AVX2-ONLY-NEXT:    vmovaps 32(%rsi), %ymm10
-; AVX2-ONLY-NEXT:    vmovaps (%rsi), %ymm11
-; AVX2-ONLY-NEXT:    vmovaps (%rdx), %ymm12
-; AVX2-ONLY-NEXT:    vmovaps (%rcx), %ymm13
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm14 = ymm12[0],ymm13[0],ymm12[2],ymm13[2]
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm15 = ymm9[0],ymm11[0],ymm9[2],ymm11[2]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm14 = ymm15[2,3],ymm14[2,3]
-; AVX2-ONLY-NEXT:    vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 32(%rdx), %ymm14
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm12 = ymm12[1],ymm13[1],ymm12[3],ymm13[3]
-; AVX2-ONLY-NEXT:    vmovaps 32(%rcx), %ymm13
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm9 = ymm9[1],ymm11[1],ymm9[3],ymm11[3]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm9 = ymm9[2,3],ymm12[2,3]
-; AVX2-ONLY-NEXT:    vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm9 = ymm14[0],ymm13[0],ymm14[2],ymm13[2]
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm11 = ymm7[0],ymm10[0],ymm7[2],ymm10[2]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm9 = ymm11[2,3],ymm9[2,3]
-; AVX2-ONLY-NEXT:    vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 64(%rdx), %ymm9
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm11 = ymm14[1],ymm13[1],ymm14[3],ymm13[3]
-; AVX2-ONLY-NEXT:    vmovaps 64(%rcx), %ymm12
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm7 = ymm7[1],ymm10[1],ymm7[3],ymm10[3]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm7 = ymm7[2,3],ymm11[2,3]
-; AVX2-ONLY-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm7 = ymm9[0],ymm12[0],ymm9[2],ymm12[2]
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm10 = ymm5[0],ymm8[0],ymm5[2],ymm8[2]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm7 = ymm10[2,3],ymm7[2,3]
-; AVX2-ONLY-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 96(%rdx), %ymm7
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm9 = ymm9[1],ymm12[1],ymm9[3],ymm12[3]
-; AVX2-ONLY-NEXT:    vmovaps 96(%rcx), %ymm10
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm5 = ymm5[1],ymm8[1],ymm5[3],ymm8[3]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm5 = ymm5[2,3],ymm9[2,3]
-; AVX2-ONLY-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm5 = ymm7[0],ymm10[0],ymm7[2],ymm10[2]
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm8 = ymm3[0],ymm6[0],ymm3[2],ymm6[2]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm5 = ymm8[2,3],ymm5[2,3]
-; AVX2-ONLY-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 128(%rdx), %ymm5
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm7 = ymm7[1],ymm10[1],ymm7[3],ymm10[3]
-; AVX2-ONLY-NEXT:    vmovaps 128(%rcx), %ymm8
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm6[1],ymm3[3],ymm6[3]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],ymm7[2,3]
-; AVX2-ONLY-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm3 = ymm5[0],ymm8[0],ymm5[2],ymm8[2]
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm6 = ymm1[0],ymm4[0],ymm1[2],ymm4[2]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm6[2,3],ymm3[2,3]
-; AVX2-ONLY-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 160(%rdx), %ymm3
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm5 = ymm5[1],ymm8[1],ymm5[3],ymm8[3]
-; AVX2-ONLY-NEXT:    vmovaps 160(%rcx), %ymm6
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm4[1],ymm1[3],ymm4[3]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm5[2,3]
-; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm6[0],ymm3[2],ymm6[2]
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm4 = ymm0[0],ymm2[0],ymm0[2],ymm2[2]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[2,3]
-; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 192(%rdi), %ymm1
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm6[1],ymm3[3],ymm6[3]
-; AVX2-ONLY-NEXT:    vmovaps 192(%rsi), %ymm4
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3]
-; AVX2-ONLY-NEXT:    vmovaps 192(%rdx), %ymm2
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3]
+; AVX2-ONLY-NEXT:    subq $520, %rsp # imm = 0x208
+; AVX2-ONLY-NEXT:    vmovaps (%rsi), %xmm0
+; AVX2-ONLY-NEXT:    vmovaps 32(%rsi), %xmm1
+; AVX2-ONLY-NEXT:    vmovaps 64(%rsi), %xmm2
+; AVX2-ONLY-NEXT:    vinsertf128 $1, (%rcx), %ymm0, %ymm0
+; AVX2-ONLY-NEXT:    vmovaps (%rdi), %xmm3
+; AVX2-ONLY-NEXT:    vmovaps 32(%rdi), %xmm4
+; AVX2-ONLY-NEXT:    vmovaps 64(%rdi), %xmm5
+; AVX2-ONLY-NEXT:    vinsertf128 $1, (%rdx), %ymm3, %ymm3
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm6 = ymm3[0],ymm0[0],ymm3[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm3[1],ymm0[1],ymm3[3],ymm0[3]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 192(%rcx), %ymm0
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm3 = ymm2[0],ymm0[0],ymm2[2],ymm0[2]
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm5 = ymm1[0],ymm4[0],ymm1[2],ymm4[2]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm5[2,3],ymm3[2,3]
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 32(%rcx), %ymm1, %ymm0
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 32(%rdx), %ymm4, %ymm1
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
 ; AVX2-ONLY-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm0[1],ymm2[3],ymm0[3]
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm4[1],ymm1[3],ymm4[3]
-; AVX2-ONLY-NEXT:    vmovaps 224(%rdi), %ymm2
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 224(%rsi), %ymm0
-; AVX2-ONLY-NEXT:    vmovaps 224(%rdx), %ymm1
-; AVX2-ONLY-NEXT:    vmovaps 224(%rcx), %ymm3
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm4 = ymm1[0],ymm3[0],ymm1[2],ymm3[2]
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm5 = ymm2[0],ymm0[0],ymm2[2],ymm0[2]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm5[2,3],ymm4[2,3]
-; AVX2-ONLY-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm3[1],ymm1[3],ymm3[3]
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm0[1],ymm2[3],ymm0[3]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 64(%rcx), %ymm2, %ymm0
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 64(%rdx), %ymm5, %ymm1
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 128(%rsi), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps 128(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 128(%rcx), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps 128(%rdx), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 64(%rsi), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps 64(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 64(%rcx), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps 64(%rdx), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 32(%rsi), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps 32(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, (%rsp) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 32(%rcx), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps 32(%rdx), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX2-ONLY-NEXT:    vmovaps 96(%rsi), %xmm0
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 96(%rcx), %ymm0, %ymm0
 ; AVX2-ONLY-NEXT:    vmovaps 96(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 96(%rcx), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps 96(%rdx), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 96(%rdx), %ymm1, %ymm1
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 128(%rsi), %xmm0
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 128(%rcx), %ymm0, %ymm0
+; AVX2-ONLY-NEXT:    vmovaps 128(%rdi), %xmm1
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 128(%rdx), %ymm1, %ymm1
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovaps 160(%rsi), %xmm0
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 160(%rcx), %ymm0, %ymm0
 ; AVX2-ONLY-NEXT:    vmovaps 160(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm15 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps 160(%rcx), %xmm1
-; AVX2-ONLY-NEXT:    vmovaps 160(%rdx), %xmm0
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm14 = xmm0[0],xmm1[0]
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm13 = xmm0[1],xmm1[1]
-; AVX2-ONLY-NEXT:    vmovaps 224(%rsi), %xmm1
-; AVX2-ONLY-NEXT:    vmovaps 224(%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm12 = xmm0[0],xmm1[0]
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm11 = xmm0[1],xmm1[1]
-; AVX2-ONLY-NEXT:    vmovaps 224(%rcx), %xmm1
-; AVX2-ONLY-NEXT:    vmovaps 224(%rdx), %xmm0
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm10 = xmm0[0],xmm1[0]
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm9 = xmm0[1],xmm1[1]
-; AVX2-ONLY-NEXT:    vmovaps 192(%rsi), %xmm1
-; AVX2-ONLY-NEXT:    vmovaps 192(%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm8 = xmm0[0],xmm1[0]
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm7 = xmm0[1],xmm1[1]
-; AVX2-ONLY-NEXT:    vmovaps 192(%rcx), %xmm1
-; AVX2-ONLY-NEXT:    vmovaps 192(%rdx), %xmm0
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm6 = xmm0[0],xmm1[0]
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm5 = xmm0[1],xmm1[1]
-; AVX2-ONLY-NEXT:    vmovaps (%rsi), %xmm1
-; AVX2-ONLY-NEXT:    vmovaps (%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm4 = xmm0[0],xmm1[0]
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm3 = xmm0[1],xmm1[1]
-; AVX2-ONLY-NEXT:    vmovaps (%rcx), %xmm1
-; AVX2-ONLY-NEXT:    vmovaps (%rdx), %xmm0
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm0[0],xmm1[0]
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, 48(%r8)
-; AVX2-ONLY-NEXT:    vmovaps %xmm3, 32(%r8)
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, 16(%r8)
-; AVX2-ONLY-NEXT:    vmovaps %xmm4, (%r8)
-; AVX2-ONLY-NEXT:    vmovaps %xmm5, 816(%r8)
-; AVX2-ONLY-NEXT:    vmovaps %xmm7, 800(%r8)
-; AVX2-ONLY-NEXT:    vmovaps %xmm6, 784(%r8)
-; AVX2-ONLY-NEXT:    vmovaps %xmm8, 768(%r8)
-; AVX2-ONLY-NEXT:    vmovaps %xmm9, 944(%r8)
-; AVX2-ONLY-NEXT:    vmovaps %xmm11, 928(%r8)
-; AVX2-ONLY-NEXT:    vmovaps %xmm10, 912(%r8)
-; AVX2-ONLY-NEXT:    vmovaps %xmm12, 896(%r8)
-; AVX2-ONLY-NEXT:    vmovaps %xmm13, 688(%r8)
-; AVX2-ONLY-NEXT:    vmovaps %xmm15, 672(%r8)
-; AVX2-ONLY-NEXT:    vmovaps %xmm14, 656(%r8)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, 640(%r8)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, 432(%r8)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, 416(%r8)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, 400(%r8)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, 384(%r8)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, 176(%r8)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, 160(%r8)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, 144(%r8)
-; AVX2-ONLY-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, 128(%r8)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, 304(%r8)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, 288(%r8)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, 272(%r8)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, 256(%r8)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, 560(%r8)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, 544(%r8)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, 528(%r8)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, 512(%r8)
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 160(%rdx), %ymm1, %ymm1
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 192(%rsi), %xmm0
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 192(%rcx), %ymm0, %ymm0
+; AVX2-ONLY-NEXT:    vmovaps 192(%rdi), %xmm1
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 192(%rdx), %ymm1, %ymm1
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 224(%rsi), %xmm0
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 224(%rcx), %ymm0, %ymm0
+; AVX2-ONLY-NEXT:    vmovaps 224(%rdi), %xmm1
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 224(%rdx), %ymm1, %ymm1
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, (%rsp) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps (%rdi), %ymm0
+; AVX2-ONLY-NEXT:    vmovaps (%rsi), %ymm1
+; AVX2-ONLY-NEXT:    vmovaps (%rdx), %ymm2
+; AVX2-ONLY-NEXT:    vmovaps (%rcx), %ymm3
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm5 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm5[2,3],ymm4[2,3]
+; AVX2-ONLY-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 32(%rdi), %ymm0
+; AVX2-ONLY-NEXT:    vmovaps 32(%rsi), %ymm1
+; AVX2-ONLY-NEXT:    vmovaps 32(%rdx), %ymm2
+; AVX2-ONLY-NEXT:    vmovaps 32(%rcx), %ymm3
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm5 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm5[2,3],ymm4[2,3]
+; AVX2-ONLY-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 64(%rdi), %ymm3
+; AVX2-ONLY-NEXT:    vmovaps 64(%rsi), %ymm1
+; AVX2-ONLY-NEXT:    vmovaps 64(%rdx), %ymm2
+; AVX2-ONLY-NEXT:    vmovaps 64(%rcx), %ymm0
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm0[0],ymm2[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm9 = ymm3[0],ymm1[0],ymm3[2],ymm1[2]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm10 = ymm9[2,3],ymm4[2,3]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm0[1],ymm2[3],ymm0[3]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm8 = ymm1[2,3],ymm0[2,3]
+; AVX2-ONLY-NEXT:    vmovaps 96(%rdi), %ymm1
+; AVX2-ONLY-NEXT:    vmovaps 96(%rsi), %ymm3
+; AVX2-ONLY-NEXT:    vmovaps 96(%rdx), %ymm9
+; AVX2-ONLY-NEXT:    vmovaps 96(%rcx), %ymm0
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm9[0],ymm0[0],ymm9[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm11 = ymm1[0],ymm3[0],ymm1[2],ymm3[2]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm7 = ymm11[2,3],ymm2[2,3]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm9[1],ymm0[1],ymm9[3],ymm0[3]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm3[1],ymm1[3],ymm3[3]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm6 = ymm1[2,3],ymm0[2,3]
+; AVX2-ONLY-NEXT:    vmovaps 128(%rdi), %ymm1
+; AVX2-ONLY-NEXT:    vmovaps 128(%rsi), %ymm3
+; AVX2-ONLY-NEXT:    vmovaps 128(%rdx), %ymm9
+; AVX2-ONLY-NEXT:    vmovaps 128(%rcx), %ymm11
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm9[0],ymm11[0],ymm9[2],ymm11[2]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm13 = ymm1[0],ymm3[0],ymm1[2],ymm3[2]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm5 = ymm13[2,3],ymm0[2,3]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm9 = ymm9[1],ymm11[1],ymm9[3],ymm11[3]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm3[1],ymm1[3],ymm3[3]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm1[2,3],ymm9[2,3]
+; AVX2-ONLY-NEXT:    vmovaps 160(%rdi), %ymm3
+; AVX2-ONLY-NEXT:    vmovaps 160(%rsi), %ymm9
+; AVX2-ONLY-NEXT:    vmovaps 160(%rdx), %ymm11
+; AVX2-ONLY-NEXT:    vmovaps 160(%rcx), %ymm13
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm11[0],ymm13[0],ymm11[2],ymm13[2]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm15 = ymm3[0],ymm9[0],ymm3[2],ymm9[2]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm15[2,3],ymm0[2,3]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm11 = ymm11[1],ymm13[1],ymm11[3],ymm13[3]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm9[1],ymm3[3],ymm9[3]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],ymm11[2,3]
+; AVX2-ONLY-NEXT:    vmovaps 192(%rdi), %ymm9
+; AVX2-ONLY-NEXT:    vmovaps 192(%rsi), %ymm11
+; AVX2-ONLY-NEXT:    vmovaps 192(%rdx), %ymm13
+; AVX2-ONLY-NEXT:    vmovaps 192(%rcx), %ymm15
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm13[0],ymm15[0],ymm13[2],ymm15[2]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm14 = ymm9[0],ymm11[0],ymm9[2],ymm11[2]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm14[2,3],ymm0[2,3]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm13 = ymm13[1],ymm15[1],ymm13[3],ymm15[3]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm9 = ymm9[1],ymm11[1],ymm9[3],ymm11[3]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm9 = ymm9[2,3],ymm13[2,3]
+; AVX2-ONLY-NEXT:    vmovaps 224(%rdi), %ymm11
+; AVX2-ONLY-NEXT:    vmovaps 224(%rsi), %ymm13
+; AVX2-ONLY-NEXT:    vmovaps 224(%rdx), %ymm14
+; AVX2-ONLY-NEXT:    vmovaps 224(%rcx), %ymm15
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm14[0],ymm15[0],ymm14[2],ymm15[2]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm12 = ymm11[0],ymm13[0],ymm11[2],ymm13[2]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm12[2,3],ymm0[2,3]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm12 = ymm14[1],ymm15[1],ymm14[3],ymm15[3]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm11 = ymm11[1],ymm13[1],ymm11[3],ymm13[3]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm11 = ymm11[2,3],ymm12[2,3]
+; AVX2-ONLY-NEXT:    vmovaps %ymm11, 992(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 960(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm9, 864(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, 832(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm3, 736(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm2, 704(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm4, 608(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm5, 576(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm6, 480(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm7, 448(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm8, 352(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm10, 320(%r8)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 992(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 224(%r8)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 960(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 192(%r8)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 864(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 96(%r8)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 832(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 64(%r8)
+; AVX2-ONLY-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 928(%r8)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 736(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 896(%r8)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 704(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 800(%r8)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 608(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 768(%r8)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 576(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 672(%r8)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 480(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 640(%r8)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 448(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 544(%r8)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 352(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 512(%r8)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 320(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 416(%r8)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 224(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 384(%r8)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 192(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 288(%r8)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 96(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 256(%r8)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 64(%r8)
-; AVX2-ONLY-NEXT:    addq $664, %rsp # imm = 0x298
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 160(%r8)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 128(%r8)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 32(%r8)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, (%r8)
+; AVX2-ONLY-NEXT:    addq $520, %rsp # imm = 0x208
 ; AVX2-ONLY-NEXT:    vzeroupper
 ; AVX2-ONLY-NEXT:    retq
 ;
@@ -3301,107 +3209,151 @@ define void @store_i64_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ;
 ; AVX2-ONLY-LABEL: store_i64_stride4_vf64:
 ; AVX2-ONLY:       # %bb.0:
-; AVX2-ONLY-NEXT:    subq $1688, %rsp # imm = 0x698
-; AVX2-ONLY-NEXT:    vmovaps 160(%rdi), %ymm0
-; AVX2-ONLY-NEXT:    vmovaps 128(%rdi), %ymm1
-; AVX2-ONLY-NEXT:    vmovaps 96(%rdi), %ymm2
-; AVX2-ONLY-NEXT:    vmovaps 64(%rdi), %ymm4
-; AVX2-ONLY-NEXT:    vmovaps 32(%rdi), %ymm6
-; AVX2-ONLY-NEXT:    vmovaps (%rdi), %ymm8
-; AVX2-ONLY-NEXT:    vmovaps 160(%rsi), %ymm3
-; AVX2-ONLY-NEXT:    vmovaps 128(%rsi), %ymm5
-; AVX2-ONLY-NEXT:    vmovaps 96(%rsi), %ymm7
-; AVX2-ONLY-NEXT:    vmovaps 64(%rsi), %ymm9
-; AVX2-ONLY-NEXT:    vmovaps 32(%rsi), %ymm10
-; AVX2-ONLY-NEXT:    vmovaps (%rsi), %ymm11
-; AVX2-ONLY-NEXT:    vmovaps (%rdx), %ymm12
-; AVX2-ONLY-NEXT:    vmovaps (%rcx), %ymm13
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm14 = ymm12[0],ymm13[0],ymm12[2],ymm13[2]
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm15 = ymm8[0],ymm11[0],ymm8[2],ymm11[2]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm14 = ymm15[2,3],ymm14[2,3]
-; AVX2-ONLY-NEXT:    vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 32(%rdx), %ymm14
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm12 = ymm12[1],ymm13[1],ymm12[3],ymm13[3]
-; AVX2-ONLY-NEXT:    vmovaps 32(%rcx), %ymm13
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm8 = ymm8[1],ymm11[1],ymm8[3],ymm11[3]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm8 = ymm8[2,3],ymm12[2,3]
-; AVX2-ONLY-NEXT:    vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm8 = ymm14[0],ymm13[0],ymm14[2],ymm13[2]
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm11 = ymm6[0],ymm10[0],ymm6[2],ymm10[2]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm8 = ymm11[2,3],ymm8[2,3]
-; AVX2-ONLY-NEXT:    vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 64(%rdx), %ymm8
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm11 = ymm14[1],ymm13[1],ymm14[3],ymm13[3]
-; AVX2-ONLY-NEXT:    vmovaps 64(%rcx), %ymm12
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm6 = ymm6[1],ymm10[1],ymm6[3],ymm10[3]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm6 = ymm6[2,3],ymm11[2,3]
-; AVX2-ONLY-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm6 = ymm8[0],ymm12[0],ymm8[2],ymm12[2]
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm10 = ymm4[0],ymm9[0],ymm4[2],ymm9[2]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm6 = ymm10[2,3],ymm6[2,3]
+; AVX2-ONLY-NEXT:    subq $1544, %rsp # imm = 0x608
+; AVX2-ONLY-NEXT:    vmovaps (%rsi), %xmm0
+; AVX2-ONLY-NEXT:    vmovaps 32(%rsi), %xmm1
+; AVX2-ONLY-NEXT:    vmovaps 64(%rsi), %xmm2
+; AVX2-ONLY-NEXT:    vinsertf128 $1, (%rcx), %ymm0, %ymm0
+; AVX2-ONLY-NEXT:    vmovaps (%rdi), %xmm3
+; AVX2-ONLY-NEXT:    vmovaps 32(%rdi), %xmm4
+; AVX2-ONLY-NEXT:    vmovaps 64(%rdi), %xmm5
+; AVX2-ONLY-NEXT:    vinsertf128 $1, (%rdx), %ymm3, %ymm3
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm6 = ymm3[0],ymm0[0],ymm3[2],ymm0[2]
 ; AVX2-ONLY-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 96(%rdx), %ymm6
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm8 = ymm8[1],ymm12[1],ymm8[3],ymm12[3]
-; AVX2-ONLY-NEXT:    vmovaps 96(%rcx), %ymm10
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm9[1],ymm4[3],ymm9[3]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],ymm8[2,3]
-; AVX2-ONLY-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm4 = ymm6[0],ymm10[0],ymm6[2],ymm10[2]
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm8 = ymm2[0],ymm7[0],ymm2[2],ymm7[2]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm8[2,3],ymm4[2,3]
-; AVX2-ONLY-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 128(%rdx), %ymm4
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm6 = ymm6[1],ymm10[1],ymm6[3],ymm10[3]
-; AVX2-ONLY-NEXT:    vmovaps 128(%rcx), %ymm8
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm7[1],ymm2[3],ymm7[3]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],ymm6[2,3]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm3[1],ymm0[1],ymm3[3],ymm0[3]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 32(%rcx), %ymm1, %ymm0
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 32(%rdx), %ymm4, %ymm1
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 64(%rcx), %ymm2, %ymm0
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 64(%rdx), %ymm5, %ymm1
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
 ; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm4[0],ymm8[0],ymm4[2],ymm8[2]
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm6 = ymm1[0],ymm5[0],ymm1[2],ymm5[2]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm6[2,3],ymm2[2,3]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 96(%rsi), %xmm0
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 96(%rcx), %ymm0, %ymm0
+; AVX2-ONLY-NEXT:    vmovaps 96(%rdi), %xmm1
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 96(%rdx), %ymm1, %ymm1
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
 ; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 160(%rdx), %ymm2
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm8[1],ymm4[3],ymm8[3]
-; AVX2-ONLY-NEXT:    vmovaps 160(%rcx), %ymm6
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm5[1],ymm1[3],ymm5[3]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm4[2,3]
-; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm6[0],ymm2[2],ymm6[2]
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm4 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[2,3]
-; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 192(%rdi), %ymm1
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm6[1],ymm2[3],ymm6[3]
-; AVX2-ONLY-NEXT:    vmovaps 192(%rsi), %ymm4
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm3[1],ymm0[3],ymm3[3]
-; AVX2-ONLY-NEXT:    vmovaps 192(%rdx), %ymm3
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 192(%rcx), %ymm0
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm3[0],ymm0[0],ymm3[2],ymm0[2]
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm5 = ymm1[0],ymm4[0],ymm1[2],ymm4[2]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm5[2,3],ymm2[2,3]
+; AVX2-ONLY-NEXT:    vmovaps 128(%rsi), %xmm0
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 128(%rcx), %ymm0, %ymm0
+; AVX2-ONLY-NEXT:    vmovaps 128(%rdi), %xmm1
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 128(%rdx), %ymm1, %ymm1
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
 ; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm3[1],ymm0[1],ymm3[3],ymm0[3]
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm4[1],ymm1[3],ymm4[3]
-; AVX2-ONLY-NEXT:    vmovaps 224(%rdi), %ymm2
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 224(%rsi), %ymm0
-; AVX2-ONLY-NEXT:    vmovaps 224(%rdx), %ymm1
-; AVX2-ONLY-NEXT:    vmovaps 224(%rcx), %ymm3
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm4 = ymm1[0],ymm3[0],ymm1[2],ymm3[2]
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm5 = ymm2[0],ymm0[0],ymm2[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vmovaps 160(%rsi), %xmm0
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 160(%rcx), %ymm0, %ymm0
+; AVX2-ONLY-NEXT:    vmovaps 160(%rdi), %xmm1
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 160(%rdx), %ymm1, %ymm1
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 192(%rsi), %xmm0
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 192(%rcx), %ymm0, %ymm0
+; AVX2-ONLY-NEXT:    vmovaps 192(%rdi), %xmm1
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 192(%rdx), %ymm1, %ymm1
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 224(%rsi), %xmm0
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 224(%rcx), %ymm0, %ymm0
+; AVX2-ONLY-NEXT:    vmovaps 224(%rdi), %xmm1
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 224(%rdx), %ymm1, %ymm1
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 256(%rsi), %xmm0
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 256(%rcx), %ymm0, %ymm0
+; AVX2-ONLY-NEXT:    vmovaps 256(%rdi), %xmm1
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 256(%rdx), %ymm1, %ymm1
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 288(%rsi), %xmm0
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 288(%rcx), %ymm0, %ymm0
+; AVX2-ONLY-NEXT:    vmovaps 288(%rdi), %xmm1
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 288(%rdx), %ymm1, %ymm1
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 320(%rsi), %xmm0
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 320(%rcx), %ymm0, %ymm0
+; AVX2-ONLY-NEXT:    vmovaps 320(%rdi), %xmm1
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 320(%rdx), %ymm1, %ymm1
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 352(%rsi), %xmm0
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 352(%rcx), %ymm0, %ymm0
+; AVX2-ONLY-NEXT:    vmovaps 352(%rdi), %xmm1
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 352(%rdx), %ymm1, %ymm1
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 384(%rsi), %xmm0
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 384(%rcx), %ymm0, %ymm0
+; AVX2-ONLY-NEXT:    vmovaps 384(%rdi), %xmm1
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 384(%rdx), %ymm1, %ymm1
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 416(%rsi), %xmm0
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 416(%rcx), %ymm0, %ymm0
+; AVX2-ONLY-NEXT:    vmovaps 416(%rdi), %xmm1
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 416(%rdx), %ymm1, %ymm1
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 448(%rsi), %xmm0
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 448(%rcx), %ymm0, %ymm0
+; AVX2-ONLY-NEXT:    vmovaps 448(%rdi), %xmm1
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 448(%rdx), %ymm1, %ymm1
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 480(%rsi), %xmm0
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 480(%rcx), %ymm0, %ymm0
+; AVX2-ONLY-NEXT:    vmovaps 480(%rdi), %xmm1
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 480(%rdx), %ymm1, %ymm1
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps (%rdi), %ymm0
+; AVX2-ONLY-NEXT:    vmovaps (%rsi), %ymm1
+; AVX2-ONLY-NEXT:    vmovaps (%rdx), %ymm2
+; AVX2-ONLY-NEXT:    vmovaps (%rcx), %ymm3
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm5 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
 ; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm5[2,3],ymm4[2,3]
 ; AVX2-ONLY-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm3[1],ymm1[3],ymm3[3]
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm0[1],ymm2[3],ymm0[3]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 256(%rdi), %ymm0
-; AVX2-ONLY-NEXT:    vmovaps 256(%rsi), %ymm1
-; AVX2-ONLY-NEXT:    vmovaps 256(%rdx), %ymm2
-; AVX2-ONLY-NEXT:    vmovaps 256(%rcx), %ymm3
+; AVX2-ONLY-NEXT:    vmovaps 32(%rdi), %ymm0
+; AVX2-ONLY-NEXT:    vmovaps 32(%rsi), %ymm1
+; AVX2-ONLY-NEXT:    vmovaps 32(%rdx), %ymm2
+; AVX2-ONLY-NEXT:    vmovaps 32(%rcx), %ymm3
 ; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
 ; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm5 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
 ; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm5[2,3],ymm4[2,3]
@@ -3410,10 +3362,10 @@ define void @store_i64_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
 ; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 288(%rdi), %ymm0
-; AVX2-ONLY-NEXT:    vmovaps 288(%rsi), %ymm1
-; AVX2-ONLY-NEXT:    vmovaps 288(%rdx), %ymm2
-; AVX2-ONLY-NEXT:    vmovaps 288(%rcx), %ymm3
+; AVX2-ONLY-NEXT:    vmovaps 64(%rdi), %ymm0
+; AVX2-ONLY-NEXT:    vmovaps 64(%rsi), %ymm1
+; AVX2-ONLY-NEXT:    vmovaps 64(%rdx), %ymm2
+; AVX2-ONLY-NEXT:    vmovaps 64(%rcx), %ymm3
 ; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
 ; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm5 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
 ; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm5[2,3],ymm4[2,3]
@@ -3422,10 +3374,10 @@ define void @store_i64_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
 ; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 320(%rdi), %ymm0
-; AVX2-ONLY-NEXT:    vmovaps 320(%rsi), %ymm1
-; AVX2-ONLY-NEXT:    vmovaps 320(%rdx), %ymm2
-; AVX2-ONLY-NEXT:    vmovaps 320(%rcx), %ymm3
+; AVX2-ONLY-NEXT:    vmovaps 96(%rdi), %ymm0
+; AVX2-ONLY-NEXT:    vmovaps 96(%rsi), %ymm1
+; AVX2-ONLY-NEXT:    vmovaps 96(%rdx), %ymm2
+; AVX2-ONLY-NEXT:    vmovaps 96(%rcx), %ymm3
 ; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
 ; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm5 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
 ; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm5[2,3],ymm4[2,3]
@@ -3434,10 +3386,10 @@ define void @store_i64_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
 ; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 352(%rdi), %ymm0
-; AVX2-ONLY-NEXT:    vmovaps 352(%rsi), %ymm1
-; AVX2-ONLY-NEXT:    vmovaps 352(%rdx), %ymm2
-; AVX2-ONLY-NEXT:    vmovaps 352(%rcx), %ymm3
+; AVX2-ONLY-NEXT:    vmovaps 128(%rdi), %ymm0
+; AVX2-ONLY-NEXT:    vmovaps 128(%rsi), %ymm1
+; AVX2-ONLY-NEXT:    vmovaps 128(%rdx), %ymm2
+; AVX2-ONLY-NEXT:    vmovaps 128(%rcx), %ymm3
 ; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
 ; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm5 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
 ; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm5[2,3],ymm4[2,3]
@@ -3446,10 +3398,10 @@ define void @store_i64_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
 ; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 384(%rdi), %ymm0
-; AVX2-ONLY-NEXT:    vmovaps 384(%rsi), %ymm1
-; AVX2-ONLY-NEXT:    vmovaps 384(%rdx), %ymm2
-; AVX2-ONLY-NEXT:    vmovaps 384(%rcx), %ymm3
+; AVX2-ONLY-NEXT:    vmovaps 160(%rdi), %ymm0
+; AVX2-ONLY-NEXT:    vmovaps 160(%rsi), %ymm1
+; AVX2-ONLY-NEXT:    vmovaps 160(%rdx), %ymm2
+; AVX2-ONLY-NEXT:    vmovaps 160(%rcx), %ymm3
 ; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
 ; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm5 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
 ; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm5[2,3],ymm4[2,3]
@@ -3458,10 +3410,10 @@ define void @store_i64_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
 ; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 416(%rdi), %ymm0
-; AVX2-ONLY-NEXT:    vmovaps 416(%rsi), %ymm1
-; AVX2-ONLY-NEXT:    vmovaps 416(%rdx), %ymm2
-; AVX2-ONLY-NEXT:    vmovaps 416(%rcx), %ymm3
+; AVX2-ONLY-NEXT:    vmovaps 192(%rdi), %ymm0
+; AVX2-ONLY-NEXT:    vmovaps 192(%rsi), %ymm1
+; AVX2-ONLY-NEXT:    vmovaps 192(%rdx), %ymm2
+; AVX2-ONLY-NEXT:    vmovaps 192(%rcx), %ymm3
 ; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
 ; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm5 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
 ; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm5[2,3],ymm4[2,3]
@@ -3470,10 +3422,22 @@ define void @store_i64_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
 ; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 448(%rdi), %ymm0
-; AVX2-ONLY-NEXT:    vmovaps 448(%rsi), %ymm1
-; AVX2-ONLY-NEXT:    vmovaps 448(%rdx), %ymm2
-; AVX2-ONLY-NEXT:    vmovaps 448(%rcx), %ymm3
+; AVX2-ONLY-NEXT:    vmovaps 224(%rdi), %ymm0
+; AVX2-ONLY-NEXT:    vmovaps 224(%rsi), %ymm1
+; AVX2-ONLY-NEXT:    vmovaps 224(%rdx), %ymm2
+; AVX2-ONLY-NEXT:    vmovaps 224(%rcx), %ymm3
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm5 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm5[2,3],ymm4[2,3]
+; AVX2-ONLY-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, (%rsp) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 256(%rdi), %ymm0
+; AVX2-ONLY-NEXT:    vmovaps 256(%rsi), %ymm1
+; AVX2-ONLY-NEXT:    vmovaps 256(%rdx), %ymm2
+; AVX2-ONLY-NEXT:    vmovaps 256(%rcx), %ymm3
 ; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
 ; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm5 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
 ; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm5[2,3],ymm4[2,3]
@@ -3482,10 +3446,10 @@ define void @store_i64_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
 ; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 480(%rdi), %ymm0
-; AVX2-ONLY-NEXT:    vmovaps 480(%rsi), %ymm1
-; AVX2-ONLY-NEXT:    vmovaps 480(%rdx), %ymm2
-; AVX2-ONLY-NEXT:    vmovaps 480(%rcx), %ymm3
+; AVX2-ONLY-NEXT:    vmovaps 288(%rdi), %ymm0
+; AVX2-ONLY-NEXT:    vmovaps 288(%rsi), %ymm1
+; AVX2-ONLY-NEXT:    vmovaps 288(%rdx), %ymm2
+; AVX2-ONLY-NEXT:    vmovaps 288(%rcx), %ymm3
 ; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
 ; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm5 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
 ; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm5[2,3],ymm4[2,3]
@@ -3494,320 +3458,78 @@ define void @store_i64_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
 ; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 256(%rsi), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps 256(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 256(%rcx), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps 256(%rdx), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 128(%rsi), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps 128(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 128(%rcx), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps 128(%rdx), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 64(%rsi), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps 64(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 64(%rcx), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps 64(%rdx), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 32(%rsi), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps 32(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 32(%rcx), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps 32(%rdx), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 96(%rsi), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps 96(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 96(%rcx), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps 96(%rdx), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 160(%rsi), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps 160(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 160(%rcx), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps 160(%rdx), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 224(%rsi), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps 224(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 224(%rcx), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps 224(%rdx), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 192(%rsi), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps 192(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 192(%rcx), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps 192(%rdx), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 288(%rsi), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps 288(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 288(%rcx), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps 288(%rdx), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 352(%rsi), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps 352(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 352(%rcx), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps 352(%rdx), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 320(%rsi), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps 320(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, (%rsp) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 320(%rcx), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps 320(%rdx), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 416(%rsi), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps 416(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 416(%rcx), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps 416(%rdx), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 480(%rsi), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps 480(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm15 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vmovaps 480(%rcx), %xmm1
-; AVX2-ONLY-NEXT:    vmovaps 480(%rdx), %xmm0
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm14 = xmm0[0],xmm1[0]
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm13 = xmm0[1],xmm1[1]
-; AVX2-ONLY-NEXT:    vmovaps 448(%rsi), %xmm1
-; AVX2-ONLY-NEXT:    vmovaps 448(%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm12 = xmm0[0],xmm1[0]
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm11 = xmm0[1],xmm1[1]
-; AVX2-ONLY-NEXT:    vmovaps 448(%rcx), %xmm1
-; AVX2-ONLY-NEXT:    vmovaps 448(%rdx), %xmm0
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm10 = xmm0[0],xmm1[0]
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm9 = xmm0[1],xmm1[1]
-; AVX2-ONLY-NEXT:    vmovaps 384(%rsi), %xmm1
-; AVX2-ONLY-NEXT:    vmovaps 384(%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm8 = xmm0[0],xmm1[0]
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm7 = xmm0[1],xmm1[1]
-; AVX2-ONLY-NEXT:    vmovaps 384(%rcx), %xmm1
-; AVX2-ONLY-NEXT:    vmovaps 384(%rdx), %xmm0
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm6 = xmm0[0],xmm1[0]
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm5 = xmm0[1],xmm1[1]
-; AVX2-ONLY-NEXT:    vmovaps (%rsi), %xmm1
-; AVX2-ONLY-NEXT:    vmovaps (%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm4 = xmm0[0],xmm1[0]
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm3 = xmm0[1],xmm1[1]
-; AVX2-ONLY-NEXT:    vmovaps (%rcx), %xmm1
-; AVX2-ONLY-NEXT:    vmovaps (%rdx), %xmm0
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm0[0],xmm1[0]
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, 48(%r8)
-; AVX2-ONLY-NEXT:    vmovaps %xmm3, 32(%r8)
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, 16(%r8)
-; AVX2-ONLY-NEXT:    vmovaps %xmm4, (%r8)
-; AVX2-ONLY-NEXT:    vmovaps %xmm5, 1584(%r8)
-; AVX2-ONLY-NEXT:    vmovaps %xmm7, 1568(%r8)
-; AVX2-ONLY-NEXT:    vmovaps %xmm6, 1552(%r8)
-; AVX2-ONLY-NEXT:    vmovaps %xmm8, 1536(%r8)
-; AVX2-ONLY-NEXT:    vmovaps %xmm9, 1840(%r8)
-; AVX2-ONLY-NEXT:    vmovaps %xmm11, 1824(%r8)
-; AVX2-ONLY-NEXT:    vmovaps %xmm10, 1808(%r8)
-; AVX2-ONLY-NEXT:    vmovaps %xmm12, 1792(%r8)
-; AVX2-ONLY-NEXT:    vmovaps %xmm13, 1968(%r8)
-; AVX2-ONLY-NEXT:    vmovaps %xmm15, 1952(%r8)
-; AVX2-ONLY-NEXT:    vmovaps %xmm14, 1936(%r8)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, 1920(%r8)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, 1712(%r8)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, 1696(%r8)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, 1680(%r8)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, 1664(%r8)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, 1328(%r8)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, 1312(%r8)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, 1296(%r8)
-; AVX2-ONLY-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, 1280(%r8)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, 1456(%r8)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, 1440(%r8)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, 1424(%r8)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, 1408(%r8)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, 1200(%r8)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, 1184(%r8)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, 1168(%r8)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, 1152(%r8)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, 816(%r8)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, 800(%r8)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, 784(%r8)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, 768(%r8)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, 944(%r8)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, 928(%r8)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, 912(%r8)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, 896(%r8)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, 688(%r8)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, 672(%r8)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, 656(%r8)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, 640(%r8)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, 432(%r8)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, 416(%r8)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, 400(%r8)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, 384(%r8)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, 176(%r8)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, 160(%r8)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, 144(%r8)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, 128(%r8)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, 304(%r8)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, 288(%r8)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, 272(%r8)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, 256(%r8)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, 560(%r8)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, 544(%r8)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, 528(%r8)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, 512(%r8)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, 1072(%r8)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, 1056(%r8)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, 1040(%r8)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, 1024(%r8)
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 2016(%r8)
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps 320(%rdi), %ymm3
+; AVX2-ONLY-NEXT:    vmovaps 320(%rsi), %ymm1
+; AVX2-ONLY-NEXT:    vmovaps 320(%rdx), %ymm2
+; AVX2-ONLY-NEXT:    vmovaps 320(%rcx), %ymm0
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm0[0],ymm2[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm15 = ymm3[0],ymm1[0],ymm3[2],ymm1[2]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm9 = ymm15[2,3],ymm4[2,3]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm0[1],ymm2[3],ymm0[3]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm8 = ymm1[2,3],ymm0[2,3]
+; AVX2-ONLY-NEXT:    vmovaps 352(%rdi), %ymm1
+; AVX2-ONLY-NEXT:    vmovaps 352(%rsi), %ymm3
+; AVX2-ONLY-NEXT:    vmovaps 352(%rdx), %ymm15
+; AVX2-ONLY-NEXT:    vmovaps 352(%rcx), %ymm0
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm15[0],ymm0[0],ymm15[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm14 = ymm1[0],ymm3[0],ymm1[2],ymm3[2]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm7 = ymm14[2,3],ymm2[2,3]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm15[1],ymm0[1],ymm15[3],ymm0[3]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm3[1],ymm1[3],ymm3[3]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm6 = ymm1[2,3],ymm0[2,3]
+; AVX2-ONLY-NEXT:    vmovaps 384(%rdi), %ymm1
+; AVX2-ONLY-NEXT:    vmovaps 384(%rsi), %ymm3
+; AVX2-ONLY-NEXT:    vmovaps 384(%rdx), %ymm14
+; AVX2-ONLY-NEXT:    vmovaps 384(%rcx), %ymm15
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm14[0],ymm15[0],ymm14[2],ymm15[2]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm13 = ymm1[0],ymm3[0],ymm1[2],ymm3[2]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm5 = ymm13[2,3],ymm0[2,3]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm13 = ymm14[1],ymm15[1],ymm14[3],ymm15[3]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm3[1],ymm1[3],ymm3[3]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm1[2,3],ymm13[2,3]
+; AVX2-ONLY-NEXT:    vmovaps 416(%rdi), %ymm3
+; AVX2-ONLY-NEXT:    vmovaps 416(%rsi), %ymm13
+; AVX2-ONLY-NEXT:    vmovaps 416(%rdx), %ymm14
+; AVX2-ONLY-NEXT:    vmovaps 416(%rcx), %ymm15
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm14[0],ymm15[0],ymm14[2],ymm15[2]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm12 = ymm3[0],ymm13[0],ymm3[2],ymm13[2]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm12[2,3],ymm0[2,3]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm12 = ymm14[1],ymm15[1],ymm14[3],ymm15[3]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm13[1],ymm3[3],ymm13[3]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],ymm12[2,3]
+; AVX2-ONLY-NEXT:    vmovaps 448(%rdi), %ymm12
+; AVX2-ONLY-NEXT:    vmovaps 448(%rsi), %ymm13
+; AVX2-ONLY-NEXT:    vmovaps 448(%rdx), %ymm14
+; AVX2-ONLY-NEXT:    vmovaps 448(%rcx), %ymm15
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm14[0],ymm15[0],ymm14[2],ymm15[2]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm11 = ymm12[0],ymm13[0],ymm12[2],ymm13[2]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm11[2,3],ymm0[2,3]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm11 = ymm14[1],ymm15[1],ymm14[3],ymm15[3]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm12 = ymm12[1],ymm13[1],ymm12[3],ymm13[3]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm11 = ymm12[2,3],ymm11[2,3]
+; AVX2-ONLY-NEXT:    vmovaps 480(%rdi), %ymm12
+; AVX2-ONLY-NEXT:    vmovaps 480(%rsi), %ymm13
+; AVX2-ONLY-NEXT:    vmovaps 480(%rdx), %ymm14
+; AVX2-ONLY-NEXT:    vmovaps 480(%rcx), %ymm15
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm14[0],ymm15[0],ymm14[2],ymm15[2]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm10 = ymm12[0],ymm13[0],ymm12[2],ymm13[2]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm10[2,3],ymm0[2,3]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm10 = ymm14[1],ymm15[1],ymm14[3],ymm15[3]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm12 = ymm12[1],ymm13[1],ymm12[3],ymm13[3]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm10 = ymm12[2,3],ymm10[2,3]
+; AVX2-ONLY-NEXT:    vmovaps %ymm10, 2016(%r8)
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1984(%r8)
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1888(%r8)
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1856(%r8)
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1760(%r8)
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1728(%r8)
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1632(%r8)
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1600(%r8)
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1504(%r8)
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1472(%r8)
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1376(%r8)
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1344(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm11, 1888(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, 1856(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm3, 1760(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm2, 1728(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm4, 1632(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm5, 1600(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm6, 1504(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm7, 1472(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm8, 1376(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm9, 1344(%r8)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1248(%r8)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
@@ -3816,7 +3538,7 @@ define void @store_i64_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1120(%r8)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1088(%r8)
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, 992(%r8)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, 960(%r8)
@@ -3848,7 +3570,71 @@ define void @store_i64_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, 96(%r8)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, 64(%r8)
-; AVX2-ONLY-NEXT:    addq $1688, %rsp # imm = 0x698
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1952(%r8)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1920(%r8)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1824(%r8)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1792(%r8)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1696(%r8)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1664(%r8)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1568(%r8)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1536(%r8)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1440(%r8)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1408(%r8)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1312(%r8)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1280(%r8)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1184(%r8)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1152(%r8)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1056(%r8)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1024(%r8)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 928(%r8)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 896(%r8)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 800(%r8)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 768(%r8)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 672(%r8)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 640(%r8)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 544(%r8)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 512(%r8)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 416(%r8)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 384(%r8)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 288(%r8)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 256(%r8)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 160(%r8)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 128(%r8)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 32(%r8)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, (%r8)
+; AVX2-ONLY-NEXT:    addq $1544, %rsp # imm = 0x608
 ; AVX2-ONLY-NEXT:    vzeroupper
 ; AVX2-ONLY-NEXT:    retq
 ;

diff  --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-5.ll
index c22478b09a15c2..9f7c408e2a6bff 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-5.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-5.ll
@@ -188,6 +188,11 @@ define void @store_i64_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm6 = ymm0[0,1,2,3],ymm6[4,5,6,7]
 ; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7]
 ; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm5 = ymm3[0,1],ymm5[2,3,4,5,6,7]
+; AVX2-ONLY-NEXT:    vmovaps (%rsi), %xmm6
+; AVX2-ONLY-NEXT:    vinsertf128 $1, (%rcx), %ymm6, %ymm6
+; AVX2-ONLY-NEXT:    vmovaps (%rdi), %xmm7
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm4, %ymm7, %ymm4
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm4 = ymm4[0],ymm6[0],ymm4[2],ymm6[2]
 ; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm6 = mem[0,0]
 ; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm7 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
 ; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7]
@@ -200,13 +205,9 @@ define void @store_i64_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-ONLY-NEXT:    vbroadcastsd 24(%rcx), %ymm2
 ; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3]
 ; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7]
-; AVX2-ONLY-NEXT:    vmovaps (%rdi), %xmm2
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm2 = xmm2[0],mem[0]
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm3 = xmm4[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm3, 16(%r9)
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, (%r9)
 ; AVX2-ONLY-NEXT:    vmovaps %ymm6, 64(%r9)
 ; AVX2-ONLY-NEXT:    vmovaps %ymm1, 128(%r9)
+; AVX2-ONLY-NEXT:    vmovaps %ymm4, (%r9)
 ; AVX2-ONLY-NEXT:    vmovaps %ymm5, 32(%r9)
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, 96(%r9)
 ; AVX2-ONLY-NEXT:    vzeroupper
@@ -396,65 +397,67 @@ define void @store_i64_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ;
 ; AVX2-ONLY-LABEL: store_i64_stride5_vf8:
 ; AVX2-ONLY:       # %bb.0:
-; AVX2-ONLY-NEXT:    vmovaps 32(%rdi), %ymm7
-; AVX2-ONLY-NEXT:    vmovaps (%rdi), %ymm2
-; AVX2-ONLY-NEXT:    vmovaps (%rdx), %ymm4
-; AVX2-ONLY-NEXT:    vmovaps (%rcx), %ymm6
-; AVX2-ONLY-NEXT:    vmovaps (%r8), %ymm5
-; AVX2-ONLY-NEXT:    vmovaps 32(%r8), %ymm9
-; AVX2-ONLY-NEXT:    vmovaps (%rdx), %xmm1
-; AVX2-ONLY-NEXT:    vmovaps 32(%rdx), %xmm8
-; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-ONLY-NEXT:    vbroadcastsd 8(%rsi), %ymm3
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7]
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3,4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm3 = mem[0,0]
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm10 = ymm7[0],mem[0],ymm7[2],mem[2]
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7]
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm9[2,3],ymm3[4,5,6,7]
-; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm8, %ymm0, %ymm10
-; AVX2-ONLY-NEXT:    vbroadcastsd 40(%rsi), %ymm11
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm11 = ymm7[0,1,2,3],ymm11[4,5,6,7]
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm10 = ymm11[0,1,2,3,4,5],ymm10[6,7]
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm10 = ymm9[0,1],ymm10[2,3,4,5,6,7]
+; AVX2-ONLY-NEXT:    vmovaps 32(%rdi), %ymm5
+; AVX2-ONLY-NEXT:    vmovaps (%rdi), %ymm0
+; AVX2-ONLY-NEXT:    vmovaps (%rdx), %ymm1
+; AVX2-ONLY-NEXT:    vmovaps (%rcx), %ymm4
+; AVX2-ONLY-NEXT:    vmovaps (%r8), %ymm2
+; AVX2-ONLY-NEXT:    vmovaps 32(%r8), %ymm6
+; AVX2-ONLY-NEXT:    vmovaps (%rdx), %xmm7
+; AVX2-ONLY-NEXT:    vmovaps 32(%rdx), %xmm10
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm7, %ymm0, %ymm3
+; AVX2-ONLY-NEXT:    vbroadcastsd 8(%rsi), %ymm8
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm8 = ymm0[0,1,2,3],ymm8[4,5,6,7]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm3 = ymm8[0,1,2,3,4,5],ymm3[6,7]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm3 = ymm2[0,1],ymm3[2,3,4,5,6,7]
+; AVX2-ONLY-NEXT:    vmovaps (%rsi), %xmm8
+; AVX2-ONLY-NEXT:    vmovaps 32(%rsi), %xmm11
+; AVX2-ONLY-NEXT:    vinsertf128 $1, (%rcx), %ymm8, %ymm8
+; AVX2-ONLY-NEXT:    vmovaps (%rdi), %xmm9
+; AVX2-ONLY-NEXT:    vmovaps 32(%rdi), %xmm12
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm7, %ymm9, %ymm7
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm7 = ymm7[0],ymm8[0],ymm7[2],ymm8[2]
+; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm8 = mem[0,0]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm9 = ymm5[0],mem[0],ymm5[2],mem[2]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm6[2,3],ymm8[4,5,6,7]
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm10, %ymm0, %ymm9
+; AVX2-ONLY-NEXT:    vbroadcastsd 40(%rsi), %ymm13
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm13 = ymm5[0,1,2,3],ymm13[4,5,6,7]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm9 = ymm13[0,1,2,3,4,5],ymm9[6,7]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm9 = ymm6[0,1],ymm9[2,3,4,5,6,7]
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 32(%rcx), %ymm11, %ymm11
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm10, %ymm12, %ymm10
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm10 = ymm10[0],ymm11[0],ymm10[2],ymm11[2]
 ; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm11 = mem[0,0]
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm12 = ymm2[0],mem[0],ymm2[2],mem[2]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm12 = ymm0[0],mem[0],ymm0[2],mem[2]
 ; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7]
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm5[2,3],ymm11[4,5,6,7]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm2[2,3],ymm11[4,5,6,7]
 ; AVX2-ONLY-NEXT:    vbroadcastsd 56(%rsi), %ymm12
 ; AVX2-ONLY-NEXT:    vblendps {{.*#+}} xmm12 = xmm12[0,1],mem[2,3]
 ; AVX2-ONLY-NEXT:    vbroadcastsd 56(%rcx), %ymm13
 ; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7]
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm9[6,7]
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm13 = ymm4[1],ymm6[1],ymm4[3],ymm6[3]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm6[6,7]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm13 = ymm1[1],ymm4[1],ymm1[3],ymm4[3]
 ; AVX2-ONLY-NEXT:    vpermpd {{.*#+}} ymm13 = ymm13[0,2,3,3]
 ; AVX2-ONLY-NEXT:    vbroadcastsd 24(%rsi), %ymm14
 ; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm13 = ymm14[0,1],ymm13[2,3,4,5,6,7]
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm5[6,7]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm7 = mem[2,3],ymm7[2,3]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm2[6,7]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm5 = mem[2,3],ymm5[2,3]
 ; AVX2-ONLY-NEXT:    vbroadcastsd 48(%rcx), %ymm14
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm14[2,3],ymm7[4,5,6,7]
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm9[4,5],ymm7[6,7]
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm4 = ymm4[0],ymm6[0],ymm4[2],ymm6[2]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm4[2,3],ymm2[2,3]
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5],ymm2[6,7]
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm4 = xmm8[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovaps (%rdi), %xmm5
-; AVX2-ONLY-NEXT:    vmovaps 32(%rdi), %xmm6
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm6 = xmm6[0],mem[0]
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm5 = xmm5[0],mem[0]
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 16(%r9)
-; AVX2-ONLY-NEXT:    vmovaps %xmm5, (%r9)
-; AVX2-ONLY-NEXT:    vmovaps %xmm6, 160(%r9)
-; AVX2-ONLY-NEXT:    vmovaps %xmm4, 176(%r9)
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm14[2,3],ymm5[4,5,6,7]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5],ymm5[6,7]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[2],ymm4[2]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7]
 ; AVX2-ONLY-NEXT:    vmovaps %ymm11, 64(%r9)
-; AVX2-ONLY-NEXT:    vmovaps %ymm2, 96(%r9)
-; AVX2-ONLY-NEXT:    vmovaps %ymm10, 192(%r9)
-; AVX2-ONLY-NEXT:    vmovaps %ymm7, 256(%r9)
-; AVX2-ONLY-NEXT:    vmovaps %ymm3, 224(%r9)
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 32(%r9)
+; AVX2-ONLY-NEXT:    vmovaps %ymm10, 160(%r9)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 96(%r9)
+; AVX2-ONLY-NEXT:    vmovaps %ymm9, 192(%r9)
+; AVX2-ONLY-NEXT:    vmovaps %ymm5, 256(%r9)
+; AVX2-ONLY-NEXT:    vmovaps %ymm8, 224(%r9)
+; AVX2-ONLY-NEXT:    vmovaps %ymm7, (%r9)
+; AVX2-ONLY-NEXT:    vmovaps %ymm3, 32(%r9)
 ; AVX2-ONLY-NEXT:    vmovaps %ymm13, 128(%r9)
 ; AVX2-ONLY-NEXT:    vmovaps %ymm12, 288(%r9)
 ; AVX2-ONLY-NEXT:    vzeroupper
@@ -939,168 +942,161 @@ define void @store_i64_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ;
 ; AVX2-ONLY-LABEL: store_i64_stride5_vf16:
 ; AVX2-ONLY:       # %bb.0:
-; AVX2-ONLY-NEXT:    subq $312, %rsp # imm = 0x138
-; AVX2-ONLY-NEXT:    vmovaps 96(%rdi), %ymm14
-; AVX2-ONLY-NEXT:    vmovaps 64(%rdi), %ymm11
-; AVX2-ONLY-NEXT:    vmovaps 32(%rdi), %ymm5
-; AVX2-ONLY-NEXT:    vmovaps (%rdi), %ymm6
-; AVX2-ONLY-NEXT:    vmovaps (%rdx), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 32(%rdx), %xmm1
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 64(%rdx), %xmm2
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-ONLY-NEXT:    vbroadcastsd 8(%rsi), %ymm3
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7]
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm3 = ymm6[0],mem[0],ymm6[2],mem[2]
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-ONLY-NEXT:    vbroadcastsd 40(%rsi), %ymm3
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7]
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm3 = ymm5[0],mem[0],ymm5[2],mem[2]
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX2-ONLY-NEXT:    vbroadcastsd 72(%rsi), %ymm3
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm3 = ymm11[0,1,2,3],ymm3[4,5,6,7]
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm3 = ymm11[0],mem[0],ymm11[2],mem[2]
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 96(%rdx), %xmm9
-; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm9, %ymm0, %ymm0
-; AVX2-ONLY-NEXT:    vbroadcastsd 104(%rsi), %ymm3
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm3 = ymm14[0,1,2,3],ymm3[4,5,6,7]
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm3 = ymm14[0],mem[0],ymm14[2],mem[2]
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
+; AVX2-ONLY-NEXT:    subq $264, %rsp # imm = 0x108
+; AVX2-ONLY-NEXT:    vmovaps 96(%rdi), %ymm0
+; AVX2-ONLY-NEXT:    vmovaps 64(%rdi), %ymm1
+; AVX2-ONLY-NEXT:    vmovaps 32(%rdi), %ymm3
+; AVX2-ONLY-NEXT:    vmovaps (%rdi), %ymm4
+; AVX2-ONLY-NEXT:    vmovaps (%rsi), %xmm2
+; AVX2-ONLY-NEXT:    vmovaps 32(%rsi), %xmm5
+; AVX2-ONLY-NEXT:    vmovaps 64(%rsi), %xmm8
+; AVX2-ONLY-NEXT:    vmovaps 96(%rsi), %xmm12
+; AVX2-ONLY-NEXT:    vinsertf128 $1, (%rcx), %ymm2, %ymm2
+; AVX2-ONLY-NEXT:    vmovaps (%rdi), %xmm6
+; AVX2-ONLY-NEXT:    vmovaps 32(%rdi), %xmm9
+; AVX2-ONLY-NEXT:    vmovaps 64(%rdi), %xmm11
+; AVX2-ONLY-NEXT:    vmovaps (%rdx), %xmm7
+; AVX2-ONLY-NEXT:    vmovaps 32(%rdx), %xmm10
+; AVX2-ONLY-NEXT:    vmovaps 64(%rdx), %xmm13
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm7, %ymm6, %ymm6
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm6[0],ymm2[0],ymm6[2],ymm2[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm7, %ymm0, %ymm6
+; AVX2-ONLY-NEXT:    vbroadcastsd 8(%rsi), %ymm7
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm7 = ymm4[0,1,2,3],ymm7[4,5,6,7]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm2 = ymm7[0,1,2,3,4,5],ymm6[6,7]
+; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm7 = mem[0,0]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm14 = ymm4[0],mem[0],ymm4[2],mem[2]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm14[4,5,6,7]
+; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 32(%rcx), %ymm5, %ymm5
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm10, %ymm9, %ymm9
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm9[0],ymm5[0],ymm9[2],ymm5[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm10, %ymm0, %ymm9
+; AVX2-ONLY-NEXT:    vbroadcastsd 40(%rsi), %ymm10
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm10 = ymm3[0,1,2,3],ymm10[4,5,6,7]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm2 = ymm10[0,1,2,3,4,5],ymm9[6,7]
+; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm10 = mem[0,0]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm14 = ymm3[0],mem[0],ymm3[2],mem[2]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm14[4,5,6,7]
+; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 64(%rcx), %ymm8, %ymm8
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm13, %ymm11, %ymm11
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm11[0],ymm8[0],ymm11[2],ymm8[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm13, %ymm0, %ymm11
+; AVX2-ONLY-NEXT:    vbroadcastsd 72(%rsi), %ymm13
+; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm13 = ymm1[0,1,2,3],ymm13[4,5,6,7]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm2 = ymm13[0,1,2,3,4,5],ymm11[6,7]
+; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm13 = mem[0,0]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm14 = ymm1[0],mem[0],ymm1[2],mem[2]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7]
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 96(%rcx), %ymm12, %ymm12
+; AVX2-ONLY-NEXT:    vmovaps 96(%rdi), %xmm14
+; AVX2-ONLY-NEXT:    vmovaps 96(%rdx), %xmm15
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm15, %ymm14, %ymm14
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm14[0],ymm12[0],ymm14[2],ymm12[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm1, (%rsp) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm15, %ymm0, %ymm14
+; AVX2-ONLY-NEXT:    vbroadcastsd 104(%rsi), %ymm15
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm15 = ymm0[0,1,2,3],ymm15[4,5,6,7]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm15[0,1,2,3,4,5],ymm14[6,7]
+; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm15 = mem[0,0]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm0[0],mem[0],ymm0[2],mem[2]
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, %ymm14
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm2[4,5,6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vbroadcastsd 56(%rsi), %ymm0
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3]
-; AVX2-ONLY-NEXT:    vbroadcastsd 56(%rcx), %ymm3
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm13 = ymm0[0,1,2,3],ymm3[4,5,6,7]
-; AVX2-ONLY-NEXT:    vbroadcastsd 120(%rsi), %ymm3
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} xmm3 = xmm3[0,1],mem[2,3]
-; AVX2-ONLY-NEXT:    vbroadcastsd 120(%rcx), %ymm15
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm7 = ymm3[0,1,2,3],ymm15[4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovaps (%rdx), %ymm3
-; AVX2-ONLY-NEXT:    vmovaps (%rcx), %ymm10
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm15 = ymm3[1],ymm10[1],ymm3[3],ymm10[3]
-; AVX2-ONLY-NEXT:    vpermpd {{.*#+}} ymm15 = ymm15[0,2,3,3]
+; AVX2-ONLY-NEXT:    vbroadcastsd 56(%rsi), %ymm2
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
+; AVX2-ONLY-NEXT:    vbroadcastsd 56(%rcx), %ymm5
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm11 = ymm2[0,1,2,3],ymm5[4,5,6,7]
+; AVX2-ONLY-NEXT:    vbroadcastsd 120(%rsi), %ymm5
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} xmm5 = xmm5[0,1],mem[2,3]
+; AVX2-ONLY-NEXT:    vbroadcastsd 120(%rcx), %ymm6
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm12 = ymm5[0,1,2,3],ymm6[4,5,6,7]
+; AVX2-ONLY-NEXT:    vmovaps (%rdx), %ymm6
+; AVX2-ONLY-NEXT:    vmovaps (%rcx), %ymm0
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm7 = ymm6[1],ymm0[1],ymm6[3],ymm0[3]
+; AVX2-ONLY-NEXT:    vpermpd {{.*#+}} ymm7 = ymm7[0,2,3,3]
 ; AVX2-ONLY-NEXT:    vbroadcastsd 24(%rsi), %ymm8
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm15 = ymm8[0,1],ymm15[2,3,4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovaps 64(%rdx), %ymm1
-; AVX2-ONLY-NEXT:    vmovaps 64(%rcx), %ymm2
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm8 = ymm1[1],ymm2[1],ymm1[3],ymm2[3]
-; AVX2-ONLY-NEXT:    vpermpd {{.*#+}} ymm8 = ymm8[0,2,3,3]
-; AVX2-ONLY-NEXT:    vbroadcastsd 88(%rsi), %ymm12
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm4 = ymm12[0,1],ymm8[2,3,4,5,6,7]
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm10[0],ymm3[2],ymm10[2]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],ymm6[2,3]
-; AVX2-ONLY-NEXT:    vmovaps (%r8), %ymm6
-; AVX2-ONLY-NEXT:    vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT:    # ymm0 = ymm6[0,1],mem[2,3,4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT:    # ymm0 = mem[0,1],ymm6[2,3],mem[4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5],ymm6[6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm6[4,5],ymm3[6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm3 = mem[2,3],ymm5[2,3]
-; AVX2-ONLY-NEXT:    vbroadcastsd 48(%rcx), %ymm5
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm5[2,3],ymm3[4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovaps 32(%r8), %ymm0
-; AVX2-ONLY-NEXT:    vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT:    # ymm5 = ymm0[0,1],mem[2,3,4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT:    # ymm5 = mem[0,1],ymm0[2,3],mem[4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm5 = ymm13[0,1,2,3,4,5],ymm0[6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm8 = ymm3[0,1,2,3],ymm0[4,5],ymm3[6,7]
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm2[0],ymm1[2],ymm2[2]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm11[2,3]
-; AVX2-ONLY-NEXT:    vmovaps 64(%r8), %ymm1
-; AVX2-ONLY-NEXT:    vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm11 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT:    # ymm11 = ymm1[0,1],mem[2,3,4,5,6,7]
-; AVX2-ONLY-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT:    # ymm6 = mem[0,1],ymm1[2,3],mem[4,5,6,7]
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm1[6,7]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3,4,5,6,7]
+; AVX2-ONLY-NEXT:    vmovaps 64(%rdx), %ymm8
+; AVX2-ONLY-NEXT:    vmovaps 64(%rcx), %ymm1
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm9 = ymm8[1],ymm1[1],ymm8[3],ymm1[3]
+; AVX2-ONLY-NEXT:    vpermpd {{.*#+}} ymm9 = ymm9[0,2,3,3]
+; AVX2-ONLY-NEXT:    vbroadcastsd 88(%rsi), %ymm10
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm9 = ymm10[0,1],ymm9[2,3,4,5,6,7]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm6[0],ymm0[0],ymm6[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm4[2,3]
+; AVX2-ONLY-NEXT:    vmovaps (%r8), %ymm4
+; AVX2-ONLY-NEXT:    vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm15 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm15 = ymm4[0,1],mem[2,3,4,5,6,7]
+; AVX2-ONLY-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm10 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm10 = mem[0,1],ymm4[2,3],mem[4,5,6,7]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm2 = ymm7[0,1,2,3,4,5],ymm4[6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm5 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm1 = mem[2,3],ymm14[2,3]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm7 = ymm0[0,1,2,3],ymm4[4,5],ymm0[6,7]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm3 = mem[2,3],ymm3[2,3]
+; AVX2-ONLY-NEXT:    vbroadcastsd 48(%rcx), %ymm4
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5,6,7]
+; AVX2-ONLY-NEXT:    vmovaps 32(%r8), %ymm4
+; AVX2-ONLY-NEXT:    vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm6 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm6 = ymm4[0,1],mem[2,3,4,5,6,7]
+; AVX2-ONLY-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm5 = mem[0,1],ymm4[2,3],mem[4,5,6,7]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm4[6,7]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5],ymm3[6,7]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm8[0],ymm1[0],ymm8[2],ymm1[2]
+; AVX2-ONLY-NEXT:    vperm2f128 $49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm1 = ymm1[2,3],mem[2,3]
+; AVX2-ONLY-NEXT:    vmovaps 64(%r8), %ymm4
+; AVX2-ONLY-NEXT:    vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm8 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm8 = ymm4[0,1],mem[2,3,4,5,6,7]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm4[2,3],ymm13[4,5,6,7]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm4[6,7]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm4[4,5],ymm1[6,7]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm4 = mem[2,3],ymm14[2,3]
 ; AVX2-ONLY-NEXT:    vbroadcastsd 112(%rcx), %ymm14
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm14 = ymm1[0,1],ymm14[2,3],ymm1[4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovaps 96(%r8), %ymm1
-; AVX2-ONLY-NEXT:    vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT:    # ymm4 = ymm1[0,1],mem[2,3,4,5,6,7]
-; AVX2-ONLY-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT:    # ymm3 = mem[0,1],ymm1[2,3],mem[4,5,6,7]
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm1[6,7]
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm2 = ymm14[0,1,2,3],ymm1[4,5],ymm14[6,7]
-; AVX2-ONLY-NEXT:    vmovaps 64(%rdi), %xmm14
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm14 = xmm14[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovaps 32(%rdi), %xmm13
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm13 = xmm13[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovaps 96(%rdi), %xmm12
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm12 = xmm12[0],mem[0]
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm9 = xmm9[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovaps (%rdi), %xmm15
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm15 = xmm15[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm10 = xmm10[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm10, 16(%r9)
-; AVX2-ONLY-NEXT:    vmovaps %xmm15, (%r9)
-; AVX2-ONLY-NEXT:    vmovaps %xmm9, 496(%r9)
-; AVX2-ONLY-NEXT:    vmovaps %xmm12, 480(%r9)
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, 176(%r9)
-; AVX2-ONLY-NEXT:    vmovaps %xmm13, 160(%r9)
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 336(%r9)
-; AVX2-ONLY-NEXT:    vmovaps %xmm14, 320(%r9)
-; AVX2-ONLY-NEXT:    vmovaps %ymm2, 576(%r9)
-; AVX2-ONLY-NEXT:    vmovaps %ymm3, 544(%r9)
-; AVX2-ONLY-NEXT:    vmovaps %ymm4, 512(%r9)
-; AVX2-ONLY-NEXT:    vmovaps %ymm5, 416(%r9)
-; AVX2-ONLY-NEXT:    vmovaps %ymm6, 384(%r9)
-; AVX2-ONLY-NEXT:    vmovaps %ymm11, 352(%r9)
-; AVX2-ONLY-NEXT:    vmovaps %ymm8, 256(%r9)
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 224(%r9)
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 192(%r9)
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 96(%r9)
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm14[2,3],ymm4[4,5,6,7]
+; AVX2-ONLY-NEXT:    vmovaps 96(%r8), %ymm14
+; AVX2-ONLY-NEXT:    vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm1 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm1 = ymm14[0,1],mem[2,3,4,5,6,7]
+; AVX2-ONLY-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm0 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm0 = mem[0,1],ymm14[2,3],mem[4,5,6,7]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm14[6,7]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm14[4,5],ymm4[6,7]
+; AVX2-ONLY-NEXT:    vmovaps %ymm4, 576(%r9)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 544(%r9)
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, 512(%r9)
+; AVX2-ONLY-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 480(%r9)
+; AVX2-ONLY-NEXT:    vmovaps %ymm2, 416(%r9)
+; AVX2-ONLY-NEXT:    vmovaps %ymm13, 384(%r9)
+; AVX2-ONLY-NEXT:    vmovaps %ymm8, 352(%r9)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 64(%r9)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 320(%r9)
+; AVX2-ONLY-NEXT:    vmovaps %ymm3, 256(%r9)
+; AVX2-ONLY-NEXT:    vmovaps %ymm5, 224(%r9)
+; AVX2-ONLY-NEXT:    vmovaps %ymm6, 192(%r9)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 32(%r9)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 160(%r9)
+; AVX2-ONLY-NEXT:    vmovaps %ymm7, 96(%r9)
+; AVX2-ONLY-NEXT:    vmovaps %ymm10, 64(%r9)
+; AVX2-ONLY-NEXT:    vmovaps %ymm15, 32(%r9)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 448(%r9)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, (%r9)
+; AVX2-ONLY-NEXT:    vmovaps %ymm9, 448(%r9)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, 128(%r9)
-; AVX2-ONLY-NEXT:    vmovaps %ymm7, 608(%r9)
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 288(%r9)
-; AVX2-ONLY-NEXT:    addq $312, %rsp # imm = 0x138
+; AVX2-ONLY-NEXT:    vmovaps %ymm12, 608(%r9)
+; AVX2-ONLY-NEXT:    vmovaps %ymm11, 288(%r9)
+; AVX2-ONLY-NEXT:    addq $264, %rsp # imm = 0x108
 ; AVX2-ONLY-NEXT:    vzeroupper
 ; AVX2-ONLY-NEXT:    retq
 ;
@@ -2084,382 +2080,362 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ;
 ; AVX2-ONLY-LABEL: store_i64_stride5_vf32:
 ; AVX2-ONLY:       # %bb.0:
-; AVX2-ONLY-NEXT:    subq $1032, %rsp # imm = 0x408
-; AVX2-ONLY-NEXT:    vmovaps 96(%rdi), %ymm4
-; AVX2-ONLY-NEXT:    vmovaps 64(%rdi), %ymm5
-; AVX2-ONLY-NEXT:    vmovaps 32(%rdi), %ymm6
-; AVX2-ONLY-NEXT:    vmovaps (%rdi), %ymm7
-; AVX2-ONLY-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps (%rdx), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 32(%rdx), %xmm2
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 64(%rdx), %xmm3
-; AVX2-ONLY-NEXT:    vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-ONLY-NEXT:    vbroadcastsd 8(%rsi), %ymm1
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm7[0],mem[0],ymm7[2],mem[2]
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-ONLY-NEXT:    subq $1128, %rsp # imm = 0x468
+; AVX2-ONLY-NEXT:    vmovaps 96(%rdi), %ymm12
+; AVX2-ONLY-NEXT:    vmovaps 64(%rdi), %ymm11
+; AVX2-ONLY-NEXT:    vmovaps 32(%rdi), %ymm13
+; AVX2-ONLY-NEXT:    vmovaps (%rdi), %ymm14
+; AVX2-ONLY-NEXT:    vmovaps (%rsi), %xmm2
+; AVX2-ONLY-NEXT:    vmovaps 32(%rsi), %xmm5
+; AVX2-ONLY-NEXT:    vmovaps 64(%rsi), %xmm1
+; AVX2-ONLY-NEXT:    vmovaps 96(%rsi), %xmm0
+; AVX2-ONLY-NEXT:    vinsertf128 $1, (%rcx), %ymm2, %ymm6
+; AVX2-ONLY-NEXT:    vmovaps (%rdi), %xmm7
+; AVX2-ONLY-NEXT:    vmovaps 32(%rdi), %xmm8
+; AVX2-ONLY-NEXT:    vmovaps 64(%rdi), %xmm3
+; AVX2-ONLY-NEXT:    vmovaps (%rdx), %xmm9
+; AVX2-ONLY-NEXT:    vmovaps 32(%rdx), %xmm10
+; AVX2-ONLY-NEXT:    vmovaps 64(%rdx), %xmm2
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm9, %ymm7, %ymm7
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm4 = ymm7[0],ymm6[0],ymm7[2],ymm6[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm9, %ymm0, %ymm6
+; AVX2-ONLY-NEXT:    vbroadcastsd 8(%rsi), %ymm7
+; AVX2-ONLY-NEXT:    vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm7 = ymm14[0,1,2,3],ymm7[4,5,6,7]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm4 = ymm7[0,1,2,3,4,5],ymm6[6,7]
+; AVX2-ONLY-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm6 = mem[0,0]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm7 = ymm14[0],mem[0],ymm14[2],mem[2]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm7[4,5,6,7]
+; AVX2-ONLY-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 32(%rcx), %ymm5, %ymm5
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm10, %ymm8, %ymm6
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm4 = ymm6[0],ymm5[0],ymm6[2],ymm5[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm10, %ymm0, %ymm5
+; AVX2-ONLY-NEXT:    vbroadcastsd 40(%rsi), %ymm6
+; AVX2-ONLY-NEXT:    vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm6 = ymm13[0,1,2,3],ymm6[4,5,6,7]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5],ymm5[6,7]
+; AVX2-ONLY-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm5 = mem[0,0]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm6 = ymm13[0],mem[0],ymm13[2],mem[2]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm6[4,5,6,7]
+; AVX2-ONLY-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 64(%rcx), %ymm1, %ymm1
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm3
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm1[0],ymm3[2],ymm1[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm1
+; AVX2-ONLY-NEXT:    vbroadcastsd 72(%rsi), %ymm2
+; AVX2-ONLY-NEXT:    vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
+; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm1 = mem[0,0]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm11[0],mem[0],ymm11[2],mem[2]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-ONLY-NEXT:    vmovups %ymm1, (%rsp) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 96(%rcx), %ymm0, %ymm0
+; AVX2-ONLY-NEXT:    vmovaps 96(%rdi), %xmm1
+; AVX2-ONLY-NEXT:    vmovaps 96(%rdx), %xmm2
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX2-ONLY-NEXT:    vbroadcastsd 40(%rsi), %ymm1
-; AVX2-ONLY-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-ONLY-NEXT:    vbroadcastsd 104(%rsi), %ymm1
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
 ; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm6[0],mem[0],ymm6[2],mem[2]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm12[0],mem[0],ymm12[2],mem[2]
 ; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
-; AVX2-ONLY-NEXT:    vbroadcastsd 72(%rsi), %ymm1
-; AVX2-ONLY-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-ONLY-NEXT:    vmovaps 128(%rsi), %xmm0
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 128(%rcx), %ymm0, %ymm0
+; AVX2-ONLY-NEXT:    vmovaps 128(%rdi), %xmm1
+; AVX2-ONLY-NEXT:    vmovaps 128(%rdx), %xmm2
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX2-ONLY-NEXT:    vmovaps 128(%rdi), %ymm2
+; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vbroadcastsd 136(%rsi), %ymm1
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
 ; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm5[0],mem[0],ymm5[2],mem[2]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm2[0],mem[0],ymm2[2],mem[2]
 ; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 96(%rdx), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-ONLY-NEXT:    vbroadcastsd 104(%rsi), %ymm1
-; AVX2-ONLY-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-ONLY-NEXT:    vmovaps 160(%rsi), %xmm0
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 160(%rcx), %ymm0, %ymm0
+; AVX2-ONLY-NEXT:    vmovaps 160(%rdi), %xmm1
+; AVX2-ONLY-NEXT:    vmovaps 160(%rdx), %xmm2
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX2-ONLY-NEXT:    vmovaps 160(%rdi), %ymm2
+; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vbroadcastsd 168(%rsi), %ymm1
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
 ; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm4[0],mem[0],ymm4[2],mem[2]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm2[0],mem[0],ymm2[2],mem[2]
 ; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 128(%rdi), %ymm2
+; AVX2-ONLY-NEXT:    vmovaps 192(%rsi), %xmm0
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 192(%rcx), %ymm0, %ymm0
+; AVX2-ONLY-NEXT:    vmovaps 192(%rdi), %xmm1
+; AVX2-ONLY-NEXT:    vmovaps 192(%rdx), %xmm2
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX2-ONLY-NEXT:    vmovaps 192(%rdi), %ymm2
 ; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vbroadcastsd 136(%rsi), %ymm0
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovaps 128(%rdx), %xmm1
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
+; AVX2-ONLY-NEXT:    vbroadcastsd 200(%rsi), %ymm1
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
 ; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm2[0],mem[0],ymm2[2],mem[2]
 ; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 160(%rdi), %ymm9
-; AVX2-ONLY-NEXT:    vbroadcastsd 168(%rsi), %ymm0
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovaps 160(%rdx), %xmm1
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm9[0],mem[0],ymm9[2],mem[2]
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-ONLY-NEXT:    vmovaps 224(%rsi), %xmm0
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 224(%rcx), %ymm0, %ymm0
+; AVX2-ONLY-NEXT:    vmovaps 224(%rdi), %xmm1
+; AVX2-ONLY-NEXT:    vmovaps 224(%rdx), %xmm3
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 192(%rdi), %ymm5
-; AVX2-ONLY-NEXT:    vbroadcastsd 200(%rsi), %ymm0
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovaps 192(%rdx), %xmm1
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm4
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7]
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm1
+; AVX2-ONLY-NEXT:    vmovaps 224(%rdi), %ymm11
+; AVX2-ONLY-NEXT:    vbroadcastsd 232(%rsi), %ymm3
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm3 = ymm11[0,1,2,3],ymm3[4,5,6,7]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm1[6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm4 = ymm5[0],mem[0],ymm5[2],mem[2]
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7]
+; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm1 = mem[0,0]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm3 = ymm11[0],mem[0],ymm11[2],mem[2]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm3[4,5,6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 224(%rdi), %ymm3
-; AVX2-ONLY-NEXT:    vbroadcastsd 232(%rsi), %ymm4
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm4 = ymm3[0,1,2,3],ymm4[4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovaps 224(%rdx), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm6
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm6[6,7]
+; AVX2-ONLY-NEXT:    vbroadcastsd 56(%rsi), %ymm1
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3]
+; AVX2-ONLY-NEXT:    vbroadcastsd 56(%rcx), %ymm3
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm3[4,5,6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm4 = mem[0,0]
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm6 = ymm3[0],mem[0],ymm3[2],mem[2]
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm6[4,5,6,7]
+; AVX2-ONLY-NEXT:    vbroadcastsd 120(%rsi), %ymm3
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} xmm3 = xmm3[0,1],mem[2,3]
+; AVX2-ONLY-NEXT:    vbroadcastsd 120(%rcx), %ymm5
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm5[4,5,6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vbroadcastsd 56(%rsi), %ymm4
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} xmm4 = xmm4[0,1],mem[2,3]
-; AVX2-ONLY-NEXT:    vbroadcastsd 56(%rcx), %ymm6
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm6[4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm0, (%rsp) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vbroadcastsd 120(%rsi), %ymm4
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} xmm4 = xmm4[0,1],mem[2,3]
-; AVX2-ONLY-NEXT:    vbroadcastsd 120(%rcx), %ymm6
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm6[4,5,6,7]
+; AVX2-ONLY-NEXT:    vbroadcastsd 184(%rsi), %ymm3
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} xmm3 = xmm3[0,1],mem[2,3]
+; AVX2-ONLY-NEXT:    vbroadcastsd 184(%rcx), %ymm5
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm5[4,5,6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vbroadcastsd 184(%rsi), %ymm4
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} xmm4 = xmm4[0,1],mem[2,3]
-; AVX2-ONLY-NEXT:    vbroadcastsd 184(%rcx), %ymm6
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm6[4,5,6,7]
+; AVX2-ONLY-NEXT:    vbroadcastsd 248(%rsi), %ymm3
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} xmm3 = xmm3[0,1],mem[2,3]
+; AVX2-ONLY-NEXT:    vbroadcastsd 248(%rcx), %ymm5
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm5[4,5,6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vbroadcastsd 248(%rsi), %ymm6
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} xmm6 = xmm6[0,1],mem[2,3]
-; AVX2-ONLY-NEXT:    vbroadcastsd 248(%rcx), %ymm13
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm13[4,5,6,7]
+; AVX2-ONLY-NEXT:    vmovaps (%rdx), %ymm3
+; AVX2-ONLY-NEXT:    vmovaps (%rcx), %ymm5
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm7 = ymm3[1],ymm5[1],ymm3[3],ymm5[3]
+; AVX2-ONLY-NEXT:    vpermpd {{.*#+}} ymm7 = ymm7[0,2,3,3]
+; AVX2-ONLY-NEXT:    vbroadcastsd 24(%rsi), %ymm9
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm9[0,1],ymm7[2,3,4,5,6,7]
+; AVX2-ONLY-NEXT:    vmovaps 64(%rdx), %ymm13
+; AVX2-ONLY-NEXT:    vmovaps 64(%rcx), %ymm4
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm9 = ymm13[1],ymm4[1],ymm13[3],ymm4[3]
+; AVX2-ONLY-NEXT:    vpermpd {{.*#+}} ymm9 = ymm9[0,2,3,3]
+; AVX2-ONLY-NEXT:    vbroadcastsd 88(%rsi), %ymm14
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm8 = ymm14[0,1],ymm9[2,3,4,5,6,7]
+; AVX2-ONLY-NEXT:    vmovaps 128(%rdx), %ymm6
+; AVX2-ONLY-NEXT:    vmovaps 128(%rcx), %ymm2
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm9 = ymm6[1],ymm2[1],ymm6[3],ymm2[3]
+; AVX2-ONLY-NEXT:    vpermpd {{.*#+}} ymm9 = ymm9[0,2,3,3]
+; AVX2-ONLY-NEXT:    vbroadcastsd 152(%rsi), %ymm14
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm10 = ymm14[0,1],ymm9[2,3,4,5,6,7]
+; AVX2-ONLY-NEXT:    vmovaps 192(%rdx), %ymm9
+; AVX2-ONLY-NEXT:    vmovaps 192(%rcx), %ymm7
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm14 = ymm9[1],ymm7[1],ymm9[3],ymm7[3]
+; AVX2-ONLY-NEXT:    vpermpd {{.*#+}} ymm14 = ymm14[0,2,3,3]
+; AVX2-ONLY-NEXT:    vbroadcastsd 216(%rsi), %ymm15
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm15 = ymm15[0,1],ymm14[2,3,4,5,6,7]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm5[0],ymm3[2],ymm5[2]
+; AVX2-ONLY-NEXT:    vperm2f128 $49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm3 = ymm3[2,3],mem[2,3]
+; AVX2-ONLY-NEXT:    vmovaps (%r8), %ymm5
+; AVX2-ONLY-NEXT:    vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm1 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm1 = ymm5[0,1],mem[2,3,4,5,6,7]
+; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm1 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm1 = mem[0,1],ymm5[2,3],mem[4,5,6,7]
+; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps (%rdx), %ymm13
-; AVX2-ONLY-NEXT:    vmovaps (%rcx), %ymm14
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm15 = ymm13[1],ymm14[1],ymm13[3],ymm14[3]
-; AVX2-ONLY-NEXT:    vpermpd {{.*#+}} ymm15 = ymm15[0,2,3,3]
-; AVX2-ONLY-NEXT:    vbroadcastsd 24(%rsi), %ymm7
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm15[2,3,4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovaps 64(%rdx), %ymm15
-; AVX2-ONLY-NEXT:    vmovaps 64(%rcx), %ymm0
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm8 = ymm15[1],ymm0[1],ymm15[3],ymm0[3]
-; AVX2-ONLY-NEXT:    vpermpd {{.*#+}} ymm8 = ymm8[0,2,3,3]
-; AVX2-ONLY-NEXT:    vbroadcastsd 88(%rsi), %ymm12
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm8 = ymm12[0,1],ymm8[2,3,4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovaps 128(%rdx), %ymm12
-; AVX2-ONLY-NEXT:    vmovaps 128(%rcx), %ymm1
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm4 = ymm12[1],ymm1[1],ymm12[3],ymm1[3]
-; AVX2-ONLY-NEXT:    vpermpd {{.*#+}} ymm4 = ymm4[0,2,3,3]
-; AVX2-ONLY-NEXT:    vbroadcastsd 152(%rsi), %ymm6
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3,4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovaps 192(%rdx), %ymm6
-; AVX2-ONLY-NEXT:    vmovaps 192(%rcx), %ymm2
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm11 = ymm6[1],ymm2[1],ymm6[3],ymm2[3]
-; AVX2-ONLY-NEXT:    vpermpd {{.*#+}} ymm11 = ymm11[0,2,3,3]
-; AVX2-ONLY-NEXT:    vbroadcastsd 216(%rsi), %ymm10
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm11[2,3,4,5,6,7]
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm11 = ymm13[0],ymm14[0],ymm13[2],ymm14[2]
-; AVX2-ONLY-NEXT:    vperm2f128 $49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT:    # ymm11 = ymm11[2,3],mem[2,3]
-; AVX2-ONLY-NEXT:    vmovaps (%r8), %ymm13
-; AVX2-ONLY-NEXT:    vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm14 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT:    # ymm14 = ymm13[0,1],mem[2,3,4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm14 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT:    # ymm14 = mem[0,1],ymm13[2,3],mem[4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm13[6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm7 = ymm11[0,1,2,3],ymm13[4,5],ymm11[6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm7 = mem[2,3],ymm7[2,3]
-; AVX2-ONLY-NEXT:    vbroadcastsd 48(%rcx), %ymm11
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm11[2,3],ymm7[4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovaps 32(%r8), %ymm11
-; AVX2-ONLY-NEXT:    vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm13 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT:    # ymm13 = ymm11[0,1],mem[2,3,4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm13 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT:    # ymm13 = mem[0,1],ymm11[2,3],mem[4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vblendps $63, (%rsp), %ymm11, %ymm13 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT:    # ymm13 = mem[0,1,2,3,4,5],ymm11[6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm11[4,5],ymm7[6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm7, (%rsp) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm15[0],ymm0[0],ymm15[2],ymm0[2]
-; AVX2-ONLY-NEXT:    vperm2f128 $49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT:    # ymm0 = ymm0[2,3],mem[2,3]
-; AVX2-ONLY-NEXT:    vmovaps 64(%r8), %ymm7
-; AVX2-ONLY-NEXT:    vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm11 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT:    # ymm11 = ymm7[0,1],mem[2,3,4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm11 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT:    # ymm11 = mem[0,1],ymm7[2,3],mem[4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm7[6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5],ymm0[6,7]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm5[4,5],ymm3[6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[2,3]
-; AVX2-ONLY-NEXT:    vbroadcastsd 112(%rcx), %ymm7
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm7[2,3],ymm0[4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovaps 96(%r8), %ymm7
-; AVX2-ONLY-NEXT:    vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm8 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT:    # ymm8 = ymm7[0,1],mem[2,3,4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm8 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT:    # ymm8 = mem[0,1],ymm7[2,3],mem[4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm8 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT:    # ymm8 = mem[0,1,2,3,4,5],ymm7[6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5],ymm0[6,7]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm3 = mem[2,3],ymm0[2,3]
+; AVX2-ONLY-NEXT:    vbroadcastsd 48(%rcx), %ymm5
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm5[2,3],ymm3[4,5,6,7]
+; AVX2-ONLY-NEXT:    vmovaps 32(%r8), %ymm5
+; AVX2-ONLY-NEXT:    vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm0 = ymm5[0,1],mem[2,3,4,5,6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm12[0],ymm1[0],ymm12[2],ymm1[2]
-; AVX2-ONLY-NEXT:    vperm2f128 $49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT:    # ymm0 = ymm0[2,3],mem[2,3]
-; AVX2-ONLY-NEXT:    vmovaps 128(%r8), %ymm1
-; AVX2-ONLY-NEXT:    vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT:    # ymm7 = ymm1[0,1],mem[2,3,4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT:    # ymm7 = mem[0,1],ymm1[2,3],mem[4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm1[6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
+; AVX2-ONLY-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm0 = mem[0,1],ymm5[2,3],mem[4,5,6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm9[2,3]
-; AVX2-ONLY-NEXT:    vbroadcastsd 176(%rcx), %ymm1
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovaps 160(%r8), %ymm1
-; AVX2-ONLY-NEXT:    vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT:    # ymm4 = ymm1[0,1],mem[2,3,4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT:    # ymm4 = mem[0,1],ymm1[2,3],mem[4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT:    # ymm4 = mem[0,1,2,3,4,5],ymm1[6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
+; AVX2-ONLY-NEXT:    vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm0 = mem[0,1,2,3,4,5],ymm5[6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm6[0],ymm2[0],ymm6[2],ymm2[2]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm5[2,3]
-; AVX2-ONLY-NEXT:    vmovaps 192(%r8), %ymm1
-; AVX2-ONLY-NEXT:    vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT:    # ymm2 = ymm1[0,1],mem[2,3,4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT:    # ymm2 = mem[0,1],ymm1[2,3],mem[4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm2 = ymm10[0,1,2,3,4,5],ymm1[6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm5[4,5],ymm3[6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm3[2,3]
-; AVX2-ONLY-NEXT:    vbroadcastsd 240(%rcx), %ymm1
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovaps 224(%r8), %ymm1
-; AVX2-ONLY-NEXT:    vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT:    # ymm2 = ymm1[0,1],mem[2,3,4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT:    # ymm2 = mem[0,1],ymm1[2,3],mem[4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT:    # ymm2 = mem[0,1,2,3,4,5],ymm1[6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm3 = ymm13[0],ymm4[0],ymm13[2],ymm4[2]
+; AVX2-ONLY-NEXT:    vperm2f128 $49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm3 = ymm3[2,3],mem[2,3]
+; AVX2-ONLY-NEXT:    vmovaps 64(%r8), %ymm0
+; AVX2-ONLY-NEXT:    vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm1 = ymm0[0,1],mem[2,3,4,5,6,7]
+; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vblendps $243, (%rsp), %ymm0, %ymm1 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm1 = mem[0,1],ymm0[2,3],mem[4,5,6,7]
+; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5],ymm0[6,7]
+; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5],ymm3[6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 128(%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm7 = xmm0[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm6 = xmm0[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovaps 64(%rdi), %xmm10
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm10 = xmm10[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm5 = xmm0[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovaps 32(%rdi), %xmm15
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm15 = xmm15[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm4 = xmm0[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovaps 96(%rdi), %xmm14
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm14 = xmm14[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm3 = xmm0[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovaps 160(%rdi), %xmm13
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm13 = xmm13[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm2 = xmm0[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovaps 224(%rdi), %xmm12
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm12 = xmm12[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovaps 192(%rdi), %xmm11
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm11 = xmm11[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovaps (%rdi), %xmm9
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm9 = xmm9[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm8 = xmm8[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm8, 16(%r9)
-; AVX2-ONLY-NEXT:    vmovaps %xmm9, (%r9)
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, 976(%r9)
-; AVX2-ONLY-NEXT:    vmovaps %xmm11, 960(%r9)
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 1136(%r9)
-; AVX2-ONLY-NEXT:    vmovaps %xmm12, 1120(%r9)
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, 816(%r9)
-; AVX2-ONLY-NEXT:    vmovaps %xmm13, 800(%r9)
-; AVX2-ONLY-NEXT:    vmovaps %xmm3, 496(%r9)
-; AVX2-ONLY-NEXT:    vmovaps %xmm14, 480(%r9)
-; AVX2-ONLY-NEXT:    vmovaps %xmm4, 176(%r9)
-; AVX2-ONLY-NEXT:    vmovaps %xmm15, 160(%r9)
-; AVX2-ONLY-NEXT:    vmovaps %xmm5, 336(%r9)
-; AVX2-ONLY-NEXT:    vmovaps %xmm10, 320(%r9)
-; AVX2-ONLY-NEXT:    vmovaps %xmm6, 656(%r9)
-; AVX2-ONLY-NEXT:    vmovaps %xmm7, 640(%r9)
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm12[2,3]
+; AVX2-ONLY-NEXT:    vbroadcastsd 112(%rcx), %ymm8
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm0[0,1],ymm8[2,3],ymm0[4,5,6,7]
+; AVX2-ONLY-NEXT:    vmovaps 96(%r8), %ymm0
+; AVX2-ONLY-NEXT:    vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm3 = ymm0[0,1],mem[2,3,4,5,6,7]
+; AVX2-ONLY-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm3 = mem[0,1],ymm0[2,3],mem[4,5,6,7]
+; AVX2-ONLY-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm3 = mem[0,1,2,3,4,5],ymm0[6,7]
+; AVX2-ONLY-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm14 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm6[0],ymm2[0],ymm6[2],ymm2[2]
+; AVX2-ONLY-NEXT:    vperm2f128 $49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm0 = ymm0[2,3],mem[2,3]
+; AVX2-ONLY-NEXT:    vmovaps 128(%r8), %ymm2
+; AVX2-ONLY-NEXT:    vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm13 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm13 = ymm2[0,1],mem[2,3,4,5,6,7]
+; AVX2-ONLY-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm12 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm12 = mem[0,1],ymm2[2,3],mem[4,5,6,7]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm10[0,1,2,3,4,5],ymm2[6,7]
+; AVX2-ONLY-NEXT:    vmovups %ymm1, (%rsp) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm8 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7]
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm2 = mem[2,3],ymm0[2,3]
+; AVX2-ONLY-NEXT:    vbroadcastsd 176(%rcx), %ymm10
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm10 = ymm2[0,1],ymm10[2,3],ymm2[4,5,6,7]
+; AVX2-ONLY-NEXT:    vmovaps 160(%r8), %ymm0
+; AVX2-ONLY-NEXT:    vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm6 = ymm0[0,1],mem[2,3,4,5,6,7]
+; AVX2-ONLY-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm5 = mem[0,1],ymm0[2,3],mem[4,5,6,7]
+; AVX2-ONLY-NEXT:    vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm1 = mem[0,1,2,3,4,5],ymm0[6,7]
+; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm4 = ymm10[0,1,2,3],ymm0[4,5],ymm10[6,7]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm7 = ymm9[0],ymm7[0],ymm9[2],ymm7[2]
+; AVX2-ONLY-NEXT:    vperm2f128 $49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm9 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm9 = ymm7[2,3],mem[2,3]
+; AVX2-ONLY-NEXT:    vmovaps 192(%r8), %ymm10
+; AVX2-ONLY-NEXT:    vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm7 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm7 = ymm10[0,1],mem[2,3,4,5,6,7]
+; AVX2-ONLY-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm3 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm3 = mem[0,1],ymm10[2,3],mem[4,5,6,7]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3,4,5],ymm10[6,7]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm10 = ymm9[0,1,2,3],ymm10[4,5],ymm9[6,7]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm9 = mem[2,3],ymm11[2,3]
+; AVX2-ONLY-NEXT:    vbroadcastsd 240(%rcx), %ymm11
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm11 = ymm9[0,1],ymm11[2,3],ymm9[4,5,6,7]
+; AVX2-ONLY-NEXT:    vmovaps 224(%r8), %ymm0
+; AVX2-ONLY-NEXT:    vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm2 = ymm0[0,1],mem[2,3,4,5,6,7]
+; AVX2-ONLY-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm1 = mem[0,1],ymm0[2,3],mem[4,5,6,7]
+; AVX2-ONLY-NEXT:    vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm9 = mem[0,1,2,3,4,5],ymm0[6,7]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5],ymm11[6,7]
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1216(%r9)
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, 1184(%r9)
+; AVX2-ONLY-NEXT:    vmovaps %ymm2, 1152(%r9)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1184(%r9)
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1152(%r9)
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1056(%r9)
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1024(%r9)
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 992(%r9)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1120(%r9)
+; AVX2-ONLY-NEXT:    vmovaps %ymm10, 1056(%r9)
+; AVX2-ONLY-NEXT:    vmovaps %ymm3, 1024(%r9)
+; AVX2-ONLY-NEXT:    vmovaps %ymm7, 992(%r9)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 896(%r9)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 960(%r9)
+; AVX2-ONLY-NEXT:    vmovaps %ymm4, 896(%r9)
+; AVX2-ONLY-NEXT:    vmovaps %ymm5, 864(%r9)
+; AVX2-ONLY-NEXT:    vmovaps %ymm6, 832(%r9)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 864(%r9)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 800(%r9)
+; AVX2-ONLY-NEXT:    vmovaps %ymm8, 736(%r9)
+; AVX2-ONLY-NEXT:    vmovaps %ymm12, 704(%r9)
+; AVX2-ONLY-NEXT:    vmovaps %ymm13, 672(%r9)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 832(%r9)
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 736(%r9)
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 704(%r9)
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 672(%r9)
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 576(%r9)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 640(%r9)
+; AVX2-ONLY-NEXT:    vmovaps %ymm14, 576(%r9)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, 544(%r9)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, 512(%r9)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 480(%r9)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, 416(%r9)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, 384(%r9)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, 352(%r9)
-; AVX2-ONLY-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 320(%r9)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, 256(%r9)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, 224(%r9)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, 192(%r9)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 160(%r9)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, 96(%r9)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, 64(%r9)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, 32(%r9)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1088(%r9)
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, (%r9)
+; AVX2-ONLY-NEXT:    vmovaps %ymm15, 1088(%r9)
+; AVX2-ONLY-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, 768(%r9)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, 448(%r9)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, 128(%r9)
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1248(%r9)
+; AVX2-ONLY-NEXT:    vmovaps %ymm9, 1248(%r9)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, 928(%r9)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, 608(%r9)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, 288(%r9)
-; AVX2-ONLY-NEXT:    addq $1032, %rsp # imm = 0x408
+; AVX2-ONLY-NEXT:    addq $1128, %rsp # imm = 0x468
 ; AVX2-ONLY-NEXT:    vzeroupper
 ; AVX2-ONLY-NEXT:    retq
 ;
@@ -4499,210 +4475,290 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ;
 ; AVX2-ONLY-LABEL: store_i64_stride5_vf64:
 ; AVX2-ONLY:       # %bb.0:
-; AVX2-ONLY-NEXT:    subq $2440, %rsp # imm = 0x988
-; AVX2-ONLY-NEXT:    vmovaps 96(%rdi), %ymm4
-; AVX2-ONLY-NEXT:    vmovaps 64(%rdi), %ymm5
-; AVX2-ONLY-NEXT:    vmovaps 32(%rdi), %ymm6
-; AVX2-ONLY-NEXT:    vmovaps (%rdi), %ymm7
-; AVX2-ONLY-NEXT:    vmovaps (%rdx), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 32(%rdx), %xmm2
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-ONLY-NEXT:    subq $2760, %rsp # imm = 0xAC8
+; AVX2-ONLY-NEXT:    vmovaps 96(%rdi), %ymm10
+; AVX2-ONLY-NEXT:    vmovaps 64(%rdi), %ymm11
+; AVX2-ONLY-NEXT:    vmovaps 32(%rdi), %ymm12
+; AVX2-ONLY-NEXT:    vmovaps (%rdi), %ymm13
+; AVX2-ONLY-NEXT:    vmovaps (%rsi), %xmm2
+; AVX2-ONLY-NEXT:    vmovaps 32(%rsi), %xmm4
+; AVX2-ONLY-NEXT:    vmovaps 64(%rsi), %xmm1
+; AVX2-ONLY-NEXT:    vmovaps 96(%rsi), %xmm0
+; AVX2-ONLY-NEXT:    vinsertf128 $1, (%rcx), %ymm2, %ymm5
+; AVX2-ONLY-NEXT:    vmovaps (%rdi), %xmm6
+; AVX2-ONLY-NEXT:    vmovaps 32(%rdi), %xmm7
+; AVX2-ONLY-NEXT:    vmovaps 64(%rdi), %xmm2
+; AVX2-ONLY-NEXT:    vmovaps (%rdx), %xmm8
+; AVX2-ONLY-NEXT:    vmovaps 32(%rdx), %xmm9
 ; AVX2-ONLY-NEXT:    vmovaps 64(%rdx), %xmm3
-; AVX2-ONLY-NEXT:    vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-ONLY-NEXT:    vbroadcastsd 8(%rsi), %ymm1
-; AVX2-ONLY-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm7[0],mem[0],ymm7[2],mem[2]
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX2-ONLY-NEXT:    vbroadcastsd 40(%rsi), %ymm1
-; AVX2-ONLY-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm6[0],mem[0],ymm6[2],mem[2]
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
-; AVX2-ONLY-NEXT:    vbroadcastsd 72(%rsi), %ymm1
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm8, %ymm6, %ymm6
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm5 = ymm6[0],ymm5[0],ymm6[2],ymm5[2]
 ; AVX2-ONLY-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm5[0],mem[0],ymm5[2],mem[2]
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm8, %ymm0, %ymm5
+; AVX2-ONLY-NEXT:    vbroadcastsd 8(%rsi), %ymm6
+; AVX2-ONLY-NEXT:    vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm6 = ymm13[0,1,2,3],ymm6[4,5,6,7]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7]
+; AVX2-ONLY-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm5 = mem[0,0]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm6 = ymm13[0],mem[0],ymm13[2],mem[2]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7]
+; AVX2-ONLY-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 32(%rcx), %ymm4, %ymm4
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm9, %ymm7, %ymm5
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm4 = ymm5[0],ymm4[0],ymm5[2],ymm4[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm9, %ymm0, %ymm4
+; AVX2-ONLY-NEXT:    vbroadcastsd 40(%rsi), %ymm5
+; AVX2-ONLY-NEXT:    vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm5 = ymm12[0,1,2,3],ymm5[4,5,6,7]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7]
+; AVX2-ONLY-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm4 = mem[0,0]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm5 = ymm12[0],mem[0],ymm12[2],mem[2]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7]
+; AVX2-ONLY-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 64(%rcx), %ymm1, %ymm1
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm1
+; AVX2-ONLY-NEXT:    vbroadcastsd 72(%rsi), %ymm2
+; AVX2-ONLY-NEXT:    vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
+; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm1 = mem[0,0]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm11[0],mem[0],ymm11[2],mem[2]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 96(%rcx), %ymm0, %ymm0
+; AVX2-ONLY-NEXT:    vmovaps 96(%rdi), %xmm1
+; AVX2-ONLY-NEXT:    vmovaps 96(%rdx), %xmm2
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 96(%rdx), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; AVX2-ONLY-NEXT:    vbroadcastsd 104(%rsi), %ymm1
-; AVX2-ONLY-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-ONLY-NEXT:    vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm1[4,5,6,7]
 ; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm4[0],mem[0],ymm4[2],mem[2]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm10[0],mem[0],ymm10[2],mem[2]
 ; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 128(%rsi), %xmm0
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 128(%rcx), %ymm0, %ymm0
+; AVX2-ONLY-NEXT:    vmovaps 128(%rdi), %xmm1
+; AVX2-ONLY-NEXT:    vmovaps 128(%rdx), %xmm2
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; AVX2-ONLY-NEXT:    vmovaps 128(%rdi), %ymm2
 ; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vbroadcastsd 136(%rsi), %ymm0
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovaps 128(%rdx), %xmm1
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
+; AVX2-ONLY-NEXT:    vbroadcastsd 136(%rsi), %ymm1
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
 ; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm2[0],mem[0],ymm2[2],mem[2]
 ; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 160(%rsi), %xmm0
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 160(%rcx), %ymm0, %ymm0
+; AVX2-ONLY-NEXT:    vmovaps 160(%rdi), %xmm1
+; AVX2-ONLY-NEXT:    vmovaps 160(%rdx), %xmm2
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; AVX2-ONLY-NEXT:    vmovaps 160(%rdi), %ymm2
 ; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vbroadcastsd 168(%rsi), %ymm0
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovaps 160(%rdx), %xmm1
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
+; AVX2-ONLY-NEXT:    vbroadcastsd 168(%rsi), %ymm1
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
 ; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm2[0],mem[0],ymm2[2],mem[2]
 ; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 192(%rsi), %xmm0
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 192(%rcx), %ymm0, %ymm0
+; AVX2-ONLY-NEXT:    vmovaps 192(%rdi), %xmm1
+; AVX2-ONLY-NEXT:    vmovaps 192(%rdx), %xmm2
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; AVX2-ONLY-NEXT:    vmovaps 192(%rdi), %ymm2
 ; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vbroadcastsd 200(%rsi), %ymm0
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovaps 192(%rdx), %xmm1
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
+; AVX2-ONLY-NEXT:    vbroadcastsd 200(%rsi), %ymm1
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
 ; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm2[0],mem[0],ymm2[2],mem[2]
 ; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 224(%rsi), %xmm0
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 224(%rcx), %ymm0, %ymm0
+; AVX2-ONLY-NEXT:    vmovaps 224(%rdi), %xmm1
+; AVX2-ONLY-NEXT:    vmovaps 224(%rdx), %xmm2
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; AVX2-ONLY-NEXT:    vmovaps 224(%rdi), %ymm2
 ; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vbroadcastsd 232(%rsi), %ymm0
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovaps 224(%rdx), %xmm1
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
+; AVX2-ONLY-NEXT:    vbroadcastsd 232(%rsi), %ymm1
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
 ; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm2[0],mem[0],ymm2[2],mem[2]
 ; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 256(%rsi), %xmm0
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 256(%rcx), %ymm0, %ymm0
+; AVX2-ONLY-NEXT:    vmovaps 256(%rdi), %xmm1
+; AVX2-ONLY-NEXT:    vmovaps 256(%rdx), %xmm2
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; AVX2-ONLY-NEXT:    vmovaps 256(%rdi), %ymm2
 ; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vbroadcastsd 264(%rsi), %ymm0
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovaps 256(%rdx), %xmm1
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
+; AVX2-ONLY-NEXT:    vbroadcastsd 264(%rsi), %ymm1
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
 ; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm2[0],mem[0],ymm2[2],mem[2]
 ; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 288(%rsi), %xmm0
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 288(%rcx), %ymm0, %ymm0
+; AVX2-ONLY-NEXT:    vmovaps 288(%rdi), %xmm1
+; AVX2-ONLY-NEXT:    vmovaps 288(%rdx), %xmm2
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; AVX2-ONLY-NEXT:    vmovaps 288(%rdi), %ymm2
 ; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vbroadcastsd 296(%rsi), %ymm0
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovaps 288(%rdx), %xmm1
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
+; AVX2-ONLY-NEXT:    vbroadcastsd 296(%rsi), %ymm1
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
 ; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm2[0],mem[0],ymm2[2],mem[2]
 ; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 320(%rsi), %xmm0
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 320(%rcx), %ymm0, %ymm0
+; AVX2-ONLY-NEXT:    vmovaps 320(%rdi), %xmm1
+; AVX2-ONLY-NEXT:    vmovaps 320(%rdx), %xmm2
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; AVX2-ONLY-NEXT:    vmovaps 320(%rdi), %ymm2
 ; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vbroadcastsd 328(%rsi), %ymm0
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovaps 320(%rdx), %xmm1
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
+; AVX2-ONLY-NEXT:    vbroadcastsd 328(%rsi), %ymm1
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
 ; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm2[0],mem[0],ymm2[2],mem[2]
 ; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 352(%rsi), %xmm0
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 352(%rcx), %ymm0, %ymm0
+; AVX2-ONLY-NEXT:    vmovaps 352(%rdi), %xmm1
+; AVX2-ONLY-NEXT:    vmovaps 352(%rdx), %xmm2
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; AVX2-ONLY-NEXT:    vmovaps 352(%rdi), %ymm2
 ; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vbroadcastsd 360(%rsi), %ymm0
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovaps 352(%rdx), %xmm1
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
+; AVX2-ONLY-NEXT:    vbroadcastsd 360(%rsi), %ymm1
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
 ; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm2[0],mem[0],ymm2[2],mem[2]
 ; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 384(%rsi), %xmm0
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 384(%rcx), %ymm0, %ymm0
+; AVX2-ONLY-NEXT:    vmovaps 384(%rdi), %xmm1
+; AVX2-ONLY-NEXT:    vmovaps 384(%rdx), %xmm2
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; AVX2-ONLY-NEXT:    vmovaps 384(%rdi), %ymm2
 ; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vbroadcastsd 392(%rsi), %ymm0
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovaps 384(%rdx), %xmm1
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
+; AVX2-ONLY-NEXT:    vbroadcastsd 392(%rsi), %ymm1
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
 ; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm2[0],mem[0],ymm2[2],mem[2]
 ; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 416(%rsi), %xmm0
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 416(%rcx), %ymm0, %ymm0
+; AVX2-ONLY-NEXT:    vmovaps 416(%rdi), %xmm1
+; AVX2-ONLY-NEXT:    vmovaps 416(%rdx), %xmm2
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; AVX2-ONLY-NEXT:    vmovaps 416(%rdi), %ymm2
 ; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vbroadcastsd 424(%rsi), %ymm0
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovaps 416(%rdx), %xmm1
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
+; AVX2-ONLY-NEXT:    vbroadcastsd 424(%rsi), %ymm1
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
 ; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm2[0],mem[0],ymm2[2],mem[2]
 ; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 448(%rsi), %xmm0
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 448(%rcx), %ymm0, %ymm0
+; AVX2-ONLY-NEXT:    vmovaps 448(%rdi), %xmm1
+; AVX2-ONLY-NEXT:    vmovaps 448(%rdx), %xmm2
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; AVX2-ONLY-NEXT:    vmovaps 448(%rdi), %ymm2
 ; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vbroadcastsd 456(%rsi), %ymm0
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovaps 448(%rdx), %xmm1
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
+; AVX2-ONLY-NEXT:    vbroadcastsd 456(%rsi), %ymm1
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
 ; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm2[0],mem[0],ymm2[2],mem[2]
 ; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 480(%rsi), %xmm0
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 480(%rcx), %ymm0, %ymm0
+; AVX2-ONLY-NEXT:    vmovaps 480(%rdi), %xmm1
+; AVX2-ONLY-NEXT:    vmovaps 480(%rdx), %xmm2
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; AVX2-ONLY-NEXT:    vmovaps 480(%rdi), %ymm2
 ; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vbroadcastsd 488(%rsi), %ymm0
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovaps 480(%rdx), %xmm1
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
+; AVX2-ONLY-NEXT:    vbroadcastsd 488(%rsi), %ymm1
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
 ; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm2[0],mem[0],ymm2[2],mem[2]
@@ -4748,10 +4804,9 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-ONLY-NEXT:    vbroadcastsd 504(%rcx), %ymm1
 ; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps (%rdx), %ymm0
-; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps (%rdx), %ymm9
 ; AVX2-ONLY-NEXT:    vmovaps (%rcx), %ymm1
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm2 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm2 = ymm9[1],ymm1[1],ymm9[3],ymm1[3]
 ; AVX2-ONLY-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,2,3,3]
 ; AVX2-ONLY-NEXT:    vbroadcastsd 24(%rsi), %ymm3
 ; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm2[2,3,4,5,6,7]
@@ -4762,7 +4817,7 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-ONLY-NEXT:    vpermpd {{.*#+}} ymm4 = ymm4[0,2,3,3]
 ; AVX2-ONLY-NEXT:    vbroadcastsd 88(%rsi), %ymm5
 ; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm4[2,3,4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovups %ymm0, (%rsp) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovaps 128(%rdx), %ymm5
 ; AVX2-ONLY-NEXT:    vmovaps 128(%rcx), %ymm4
 ; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm6 = ymm5[1],ymm4[1],ymm5[3],ymm4[3]
@@ -4770,56 +4825,57 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-ONLY-NEXT:    vbroadcastsd 152(%rsi), %ymm7
 ; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm7[0,1],ymm6[2,3,4,5,6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 192(%rdx), %ymm7
-; AVX2-ONLY-NEXT:    vmovaps 192(%rcx), %ymm6
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm8 = ymm7[1],ymm6[1],ymm7[3],ymm6[3]
-; AVX2-ONLY-NEXT:    vpermpd {{.*#+}} ymm8 = ymm8[0,2,3,3]
-; AVX2-ONLY-NEXT:    vbroadcastsd 216(%rsi), %ymm9
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm9[0,1],ymm8[2,3,4,5,6,7]
+; AVX2-ONLY-NEXT:    vmovaps 192(%rdx), %ymm12
+; AVX2-ONLY-NEXT:    vmovaps 192(%rcx), %ymm14
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm6 = ymm12[1],ymm14[1],ymm12[3],ymm14[3]
+; AVX2-ONLY-NEXT:    vpermpd {{.*#+}} ymm6 = ymm6[0,2,3,3]
+; AVX2-ONLY-NEXT:    vbroadcastsd 216(%rsi), %ymm7
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm7[0,1],ymm6[2,3,4,5,6,7]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 256(%rdx), %ymm13
+; AVX2-ONLY-NEXT:    vmovaps 256(%rcx), %ymm6
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm7 = ymm13[1],ymm6[1],ymm13[3],ymm6[3]
+; AVX2-ONLY-NEXT:    vpermpd {{.*#+}} ymm7 = ymm7[0,2,3,3]
+; AVX2-ONLY-NEXT:    vbroadcastsd 280(%rsi), %ymm8
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm8[0,1],ymm7[2,3,4,5,6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 256(%rdx), %ymm9
-; AVX2-ONLY-NEXT:    vmovaps 256(%rcx), %ymm8
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm10 = ymm9[1],ymm8[1],ymm9[3],ymm8[3]
+; AVX2-ONLY-NEXT:    vmovaps 320(%rdx), %ymm8
+; AVX2-ONLY-NEXT:    vmovaps 320(%rcx), %ymm7
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm10 = ymm8[1],ymm7[1],ymm8[3],ymm7[3]
 ; AVX2-ONLY-NEXT:    vpermpd {{.*#+}} ymm10 = ymm10[0,2,3,3]
-; AVX2-ONLY-NEXT:    vbroadcastsd 280(%rsi), %ymm11
+; AVX2-ONLY-NEXT:    vbroadcastsd 344(%rsi), %ymm11
 ; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm11[0,1],ymm10[2,3,4,5,6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 320(%rdx), %ymm11
-; AVX2-ONLY-NEXT:    vmovaps 320(%rcx), %ymm10
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm12 = ymm11[1],ymm10[1],ymm11[3],ymm10[3]
-; AVX2-ONLY-NEXT:    vpermpd {{.*#+}} ymm12 = ymm12[0,2,3,3]
-; AVX2-ONLY-NEXT:    vbroadcastsd 344(%rsi), %ymm13
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm13[0,1],ymm12[2,3,4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm0, (%rsp) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 384(%rdx), %ymm13
-; AVX2-ONLY-NEXT:    vmovaps 384(%rcx), %ymm12
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm14 = ymm13[1],ymm12[1],ymm13[3],ymm12[3]
-; AVX2-ONLY-NEXT:    vpermpd {{.*#+}} ymm14 = ymm14[0,2,3,3]
+; AVX2-ONLY-NEXT:    vmovaps 384(%rdx), %ymm0
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 384(%rcx), %ymm10
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm11 = ymm0[1],ymm10[1],ymm0[3],ymm10[3]
+; AVX2-ONLY-NEXT:    vpermpd {{.*#+}} ymm11 = ymm11[0,2,3,3]
 ; AVX2-ONLY-NEXT:    vbroadcastsd 408(%rsi), %ymm15
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm15[0,1],ymm14[2,3,4,5,6,7]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm15[0,1],ymm11[2,3,4,5,6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 448(%rdx), %ymm14
-; AVX2-ONLY-NEXT:    vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 448(%rdx), %ymm11
+; AVX2-ONLY-NEXT:    vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovaps 448(%rcx), %ymm0
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm15 = ymm14[1],ymm0[1],ymm14[3],ymm0[3]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm15 = ymm11[1],ymm0[1],ymm11[3],ymm0[3]
 ; AVX2-ONLY-NEXT:    vpermpd {{.*#+}} ymm15 = ymm15[0,2,3,3]
-; AVX2-ONLY-NEXT:    vbroadcastsd 472(%rsi), %ymm14
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm15 = ymm14[0,1],ymm15[2,3,4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; AVX2-ONLY-NEXT:    vbroadcastsd 472(%rsi), %ymm11
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm11[0,1],ymm15[2,3,4,5,6,7]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm9[0],ymm1[0],ymm9[2],ymm1[2]
 ; AVX2-ONLY-NEXT:    vperm2f128 $49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
 ; AVX2-ONLY-NEXT:    # ymm0 = ymm0[2,3],mem[2,3]
 ; AVX2-ONLY-NEXT:    vmovaps (%r8), %ymm1
-; AVX2-ONLY-NEXT:    vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT:    # ymm14 = ymm1[0,1],mem[2,3,4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT:    # ymm14 = mem[0,1],ymm1[2,3],mem[4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT:    # ymm14 = mem[0,1,2,3,4,5],ymm1[6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm11 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm11 = ymm1[0,1],mem[2,3,4,5,6,7]
+; AVX2-ONLY-NEXT:    vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm11 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm11 = mem[0,1],ymm1[2,3],mem[4,5,6,7]
+; AVX2-ONLY-NEXT:    vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm11 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm11 = mem[0,1,2,3,4,5],ymm1[6,7]
+; AVX2-ONLY-NEXT:    vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
@@ -4827,15 +4883,15 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-ONLY-NEXT:    vbroadcastsd 48(%rcx), %ymm1
 ; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
 ; AVX2-ONLY-NEXT:    vmovaps 32(%r8), %ymm1
-; AVX2-ONLY-NEXT:    vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT:    # ymm14 = ymm1[0,1],mem[2,3,4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT:    # ymm14 = mem[0,1],ymm1[2,3],mem[4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT:    # ymm14 = mem[0,1,2,3,4,5],ymm1[6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm11 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm11 = ymm1[0,1],mem[2,3,4,5,6,7]
+; AVX2-ONLY-NEXT:    vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm11 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm11 = mem[0,1],ymm1[2,3],mem[4,5,6,7]
+; AVX2-ONLY-NEXT:    vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm11 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm11 = mem[0,1,2,3,4,5],ymm1[6,7]
+; AVX2-ONLY-NEXT:    vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
@@ -4848,11 +4904,11 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-ONLY-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload
 ; AVX2-ONLY-NEXT:    # ymm2 = mem[0,1],ymm1[2,3],mem[4,5,6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    vblendps $63, (%rsp), %ymm1, %ymm2 # 32-byte Folded Reload
 ; AVX2-ONLY-NEXT:    # ymm2 = mem[0,1,2,3,4,5],ymm1[6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovups %ymm0, (%rsp) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[2,3]
 ; AVX2-ONLY-NEXT:    vbroadcastsd 112(%rcx), %ymm1
@@ -4900,7 +4956,7 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm7[0],ymm6[0],ymm7[2],ymm6[2]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm12[0],ymm14[0],ymm12[2],ymm14[2]
 ; AVX2-ONLY-NEXT:    vperm2f128 $49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
 ; AVX2-ONLY-NEXT:    # ymm0 = ymm0[2,3],mem[2,3]
 ; AVX2-ONLY-NEXT:    vmovaps 192(%r8), %ymm1
@@ -4919,369 +4975,246 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[2,3]
 ; AVX2-ONLY-NEXT:    vbroadcastsd 240(%rcx), %ymm1
 ; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovaps 224(%r8), %ymm1
-; AVX2-ONLY-NEXT:    vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT:    # ymm2 = ymm1[0,1],mem[2,3,4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT:    # ymm2 = mem[0,1],ymm1[2,3],mem[4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT:    # ymm2 = mem[0,1,2,3,4,5],ymm1[6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
+; AVX2-ONLY-NEXT:    vmovaps 224(%r8), %ymm2
+; AVX2-ONLY-NEXT:    vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm1 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm1 = ymm2[0,1],mem[2,3,4,5,6,7]
+; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm1 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm1 = mem[0,1],ymm2[2,3],mem[4,5,6,7]
+; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm1 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm1 = mem[0,1,2,3,4,5],ymm2[6,7]
+; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm9[0],ymm8[0],ymm9[2],ymm8[2]
-; AVX2-ONLY-NEXT:    vperm2f128 $49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT:    # ymm0 = ymm0[2,3],mem[2,3]
-; AVX2-ONLY-NEXT:    vmovaps 256(%r8), %ymm1
-; AVX2-ONLY-NEXT:    vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT:    # ymm2 = ymm1[0,1],mem[2,3,4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT:    # ymm2 = mem[0,1],ymm1[2,3],mem[4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT:    # ymm2 = mem[0,1,2,3,4,5],ymm1[6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm13[0],ymm6[0],ymm13[2],ymm6[2]
+; AVX2-ONLY-NEXT:    vperm2f128 $49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm2 = ymm0[2,3],mem[2,3]
+; AVX2-ONLY-NEXT:    vmovaps 256(%r8), %ymm3
+; AVX2-ONLY-NEXT:    vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm0 = ymm3[0,1],mem[2,3,4,5,6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[2,3]
-; AVX2-ONLY-NEXT:    vbroadcastsd 304(%rcx), %ymm1
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovaps 288(%r8), %ymm1
-; AVX2-ONLY-NEXT:    vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT:    # ymm2 = ymm1[0,1],mem[2,3,4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT:    # ymm2 = mem[0,1],ymm1[2,3],mem[4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT:    # ymm2 = mem[0,1,2,3,4,5],ymm1[6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
+; AVX2-ONLY-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm0 = mem[0,1],ymm3[2,3],mem[4,5,6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm11[0],ymm10[0],ymm11[2],ymm10[2]
-; AVX2-ONLY-NEXT:    vperm2f128 $49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT:    # ymm0 = ymm0[2,3],mem[2,3]
-; AVX2-ONLY-NEXT:    vmovaps 320(%r8), %ymm1
-; AVX2-ONLY-NEXT:    vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT:    # ymm2 = ymm1[0,1],mem[2,3,4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT:    # ymm2 = mem[0,1],ymm1[2,3],mem[4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vblendps $63, (%rsp), %ymm1, %ymm2 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT:    # ymm2 = mem[0,1,2,3,4,5],ymm1[6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
+; AVX2-ONLY-NEXT:    vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm0 = mem[0,1,2,3,4,5],ymm3[6,7]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[2,3]
-; AVX2-ONLY-NEXT:    vbroadcastsd 368(%rcx), %ymm1
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovaps 352(%r8), %ymm1
-; AVX2-ONLY-NEXT:    vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT:    # ymm2 = ymm1[0,1],mem[2,3,4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT:    # ymm2 = mem[0,1],ymm1[2,3],mem[4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT:    # ymm2 = mem[0,1,2,3,4,5],ymm1[6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm2 = mem[2,3],ymm0[2,3]
+; AVX2-ONLY-NEXT:    vbroadcastsd 304(%rcx), %ymm3
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm3 = ymm2[0,1],ymm3[2,3],ymm2[4,5,6,7]
+; AVX2-ONLY-NEXT:    vmovaps 288(%r8), %ymm4
+; AVX2-ONLY-NEXT:    vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm0 = ymm4[0,1],mem[2,3,4,5,6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm13[0],ymm12[0],ymm13[2],ymm12[2]
-; AVX2-ONLY-NEXT:    vperm2f128 $49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT:    # ymm0 = ymm0[2,3],mem[2,3]
-; AVX2-ONLY-NEXT:    vmovaps 384(%r8), %ymm1
-; AVX2-ONLY-NEXT:    vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT:    # ymm2 = ymm1[0,1],mem[2,3,4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT:    # ymm2 = mem[0,1],ymm1[2,3],mem[4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT:    # ymm2 = mem[0,1,2,3,4,5],ymm1[6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
+; AVX2-ONLY-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm0 = mem[0,1],ymm4[2,3],mem[4,5,6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[2,3]
-; AVX2-ONLY-NEXT:    vbroadcastsd 432(%rcx), %ymm1
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovaps 416(%r8), %ymm1
-; AVX2-ONLY-NEXT:    vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT:    # ymm2 = ymm1[0,1],mem[2,3,4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT:    # ymm2 = mem[0,1],ymm1[2,3],mem[4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT:    # ymm2 = mem[0,1,2,3,4,5],ymm1[6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
+; AVX2-ONLY-NEXT:    vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm0 = mem[0,1,2,3,4,5],ymm4[6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT:    # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2]
-; AVX2-ONLY-NEXT:    vperm2f128 $49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT:    # ymm0 = ymm0[2,3],mem[2,3]
-; AVX2-ONLY-NEXT:    vmovaps 448(%r8), %ymm1
-; AVX2-ONLY-NEXT:    vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT:    # ymm2 = ymm1[0,1],mem[2,3,4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT:    # ymm2 = mem[0,1],ymm1[2,3],mem[4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm2 = ymm15[0,1,2,3,4,5],ymm1[6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm4[4,5],ymm3[6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[2,3]
-; AVX2-ONLY-NEXT:    vbroadcastsd 496(%rcx), %ymm1
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovaps 480(%r8), %ymm1
-; AVX2-ONLY-NEXT:    vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT:    # ymm2 = ymm1[0,1],mem[2,3,4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT:    # ymm2 = mem[0,1],ymm1[2,3],mem[4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT:    # ymm2 = mem[0,1,2,3,4,5],ymm1[6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm3 = ymm8[0],ymm7[0],ymm8[2],ymm7[2]
+; AVX2-ONLY-NEXT:    vperm2f128 $49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm3 = ymm3[2,3],mem[2,3]
+; AVX2-ONLY-NEXT:    vmovaps 320(%r8), %ymm4
+; AVX2-ONLY-NEXT:    vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm0 = ymm4[0,1],mem[2,3,4,5,6,7]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm0 = mem[0,1],ymm4[2,3],mem[4,5,6,7]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm0 = mem[0,1,2,3,4,5],ymm4[6,7]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm4[4,5],ymm3[6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 256(%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 128(%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 64(%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 32(%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 96(%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 160(%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 224(%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 192(%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 288(%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm7 = xmm0[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm6 = xmm0[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovaps 352(%rdi), %xmm15
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm15 = xmm15[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm5 = xmm0[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovaps 320(%rdi), %xmm14
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm14 = xmm14[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm4 = xmm0[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovaps 416(%rdi), %xmm13
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm13 = xmm13[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm3 = xmm0[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovaps 480(%rdi), %xmm12
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm12 = xmm12[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm2 = xmm0[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovaps 448(%rdi), %xmm11
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm11 = xmm11[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovaps 384(%rdi), %xmm10
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm10 = xmm10[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovaps (%rdi), %xmm9
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm9 = xmm9[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm8 = xmm8[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm8, 16(%r9)
-; AVX2-ONLY-NEXT:    vmovaps %xmm9, (%r9)
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, 1936(%r9)
-; AVX2-ONLY-NEXT:    vmovaps %xmm10, 1920(%r9)
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 2256(%r9)
-; AVX2-ONLY-NEXT:    vmovaps %xmm11, 2240(%r9)
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, 2416(%r9)
-; AVX2-ONLY-NEXT:    vmovaps %xmm12, 2400(%r9)
-; AVX2-ONLY-NEXT:    vmovaps %xmm3, 2096(%r9)
-; AVX2-ONLY-NEXT:    vmovaps %xmm13, 2080(%r9)
-; AVX2-ONLY-NEXT:    vmovaps %xmm4, 1616(%r9)
-; AVX2-ONLY-NEXT:    vmovaps %xmm14, 1600(%r9)
-; AVX2-ONLY-NEXT:    vmovaps %xmm5, 1776(%r9)
-; AVX2-ONLY-NEXT:    vmovaps %xmm15, 1760(%r9)
-; AVX2-ONLY-NEXT:    vmovaps %xmm6, 1456(%r9)
-; AVX2-ONLY-NEXT:    vmovaps %xmm7, 1440(%r9)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, 976(%r9)
-; AVX2-ONLY-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, 960(%r9)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, 1136(%r9)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, 1120(%r9)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, 816(%r9)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, 800(%r9)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, 496(%r9)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, 480(%r9)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, 176(%r9)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, 160(%r9)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, 336(%r9)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, 320(%r9)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, 656(%r9)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, 640(%r9)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, 1296(%r9)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, 1280(%r9)
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 2496(%r9)
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 2464(%r9)
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 2432(%r9)
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 2336(%r9)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 2304(%r9)
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm3 = mem[2,3],ymm0[2,3]
+; AVX2-ONLY-NEXT:    vbroadcastsd 368(%rcx), %ymm11
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm11 = ymm3[0,1],ymm11[2,3],ymm3[4,5,6,7]
+; AVX2-ONLY-NEXT:    vmovaps 352(%r8), %ymm15
+; AVX2-ONLY-NEXT:    vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm0 = ymm15[0,1],mem[2,3,4,5,6,7]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm0 = mem[0,1],ymm15[2,3],mem[4,5,6,7]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm0 = mem[0,1,2,3,4,5],ymm15[6,7]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm15 = ymm11[0,1,2,3],ymm15[4,5],ymm11[6,7]
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 2272(%r9)
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm10 = ymm0[0],ymm10[0],ymm0[2],ymm10[2]
+; AVX2-ONLY-NEXT:    vperm2f128 $49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm10 = ymm10[2,3],mem[2,3]
+; AVX2-ONLY-NEXT:    vmovaps 384(%r8), %ymm11
+; AVX2-ONLY-NEXT:    vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm14 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm14 = ymm11[0,1],mem[2,3,4,5,6,7]
+; AVX2-ONLY-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm12 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm12 = mem[0,1],ymm11[2,3],mem[4,5,6,7]
+; AVX2-ONLY-NEXT:    vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm0 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm0 = mem[0,1,2,3,4,5],ymm11[6,7]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm11[4,5],ymm10[6,7]
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 2176(%r9)
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm10 = mem[2,3],ymm0[2,3]
+; AVX2-ONLY-NEXT:    vbroadcastsd 432(%rcx), %ymm11
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm11[2,3],ymm10[4,5,6,7]
+; AVX2-ONLY-NEXT:    vmovaps 416(%r8), %ymm11
+; AVX2-ONLY-NEXT:    vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm8 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm8 = ymm11[0,1],mem[2,3,4,5,6,7]
+; AVX2-ONLY-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm7 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm7 = mem[0,1],ymm11[2,3],mem[4,5,6,7]
+; AVX2-ONLY-NEXT:    vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm0 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm0 = mem[0,1,2,3,4,5],ymm11[6,7]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm11[4,5],ymm10[6,7]
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 2144(%r9)
+; AVX2-ONLY-NEXT:    vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm10 = ymm0[0],mem[0],ymm0[2],mem[2]
+; AVX2-ONLY-NEXT:    vperm2f128 $49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm10 = ymm10[2,3],mem[2,3]
+; AVX2-ONLY-NEXT:    vmovaps 448(%r8), %ymm0
+; AVX2-ONLY-NEXT:    vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm5 = ymm0[0,1],mem[2,3,4,5,6,7]
+; AVX2-ONLY-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm4 = mem[0,1],ymm0[2,3],mem[4,5,6,7]
+; AVX2-ONLY-NEXT:    vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm11 = mem[0,1,2,3,4,5],ymm0[6,7]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm3 = ymm10[0,1,2,3],ymm0[4,5],ymm10[6,7]
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 2112(%r9)
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm10 = mem[2,3],ymm0[2,3]
+; AVX2-ONLY-NEXT:    vbroadcastsd 496(%rcx), %ymm13
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm13 = ymm10[0,1],ymm13[2,3],ymm10[4,5,6,7]
+; AVX2-ONLY-NEXT:    vmovaps 480(%r8), %ymm0
+; AVX2-ONLY-NEXT:    vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm2 = ymm0[0,1],mem[2,3,4,5,6,7]
+; AVX2-ONLY-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm1 = mem[0,1],ymm0[2,3],mem[4,5,6,7]
+; AVX2-ONLY-NEXT:    vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm10 = mem[0,1,2,3,4,5],ymm0[6,7]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5],ymm13[6,7]
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 2496(%r9)
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, 2464(%r9)
+; AVX2-ONLY-NEXT:    vmovaps %ymm2, 2432(%r9)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 2016(%r9)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 2400(%r9)
+; AVX2-ONLY-NEXT:    vmovaps %ymm3, 2336(%r9)
+; AVX2-ONLY-NEXT:    vmovaps %ymm4, 2304(%r9)
+; AVX2-ONLY-NEXT:    vmovaps %ymm5, 2272(%r9)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1984(%r9)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 2240(%r9)
+; AVX2-ONLY-NEXT:    vmovaps %ymm6, 2176(%r9)
+; AVX2-ONLY-NEXT:    vmovaps %ymm7, 2144(%r9)
+; AVX2-ONLY-NEXT:    vmovaps %ymm8, 2112(%r9)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1952(%r9)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 2080(%r9)
+; AVX2-ONLY-NEXT:    vmovaps %ymm9, 2016(%r9)
+; AVX2-ONLY-NEXT:    vmovaps %ymm12, 1984(%r9)
+; AVX2-ONLY-NEXT:    vmovaps %ymm14, 1952(%r9)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1856(%r9)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1920(%r9)
+; AVX2-ONLY-NEXT:    vmovaps %ymm15, 1856(%r9)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1824(%r9)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1792(%r9)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1760(%r9)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1696(%r9)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1664(%r9)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1632(%r9)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1600(%r9)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1536(%r9)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1504(%r9)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1472(%r9)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1440(%r9)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1376(%r9)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1344(%r9)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1312(%r9)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1280(%r9)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1216(%r9)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1184(%r9)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1152(%r9)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1120(%r9)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1056(%r9)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1024(%r9)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, 992(%r9)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 960(%r9)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, 896(%r9)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, 864(%r9)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, 832(%r9)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 800(%r9)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, 736(%r9)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, 704(%r9)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, 672(%r9)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 640(%r9)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, 576(%r9)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, 544(%r9)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, 512(%r9)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 480(%r9)
+; AVX2-ONLY-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, 416(%r9)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, 384(%r9)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, 352(%r9)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 320(%r9)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, 256(%r9)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, 224(%r9)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, 192(%r9)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 160(%r9)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, 96(%r9)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, 64(%r9)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, 32(%r9)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 2368(%r9)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, (%r9)
+; AVX2-ONLY-NEXT:    vmovaps %ymm11, 2368(%r9)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, 2048(%r9)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
@@ -5296,8 +5229,7 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, 448(%r9)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, 128(%r9)
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 2528(%r9)
+; AVX2-ONLY-NEXT:    vmovaps %ymm10, 2528(%r9)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, 2208(%r9)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
@@ -5312,7 +5244,7 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, 608(%r9)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, 288(%r9)
-; AVX2-ONLY-NEXT:    addq $2440, %rsp # imm = 0x988
+; AVX2-ONLY-NEXT:    addq $2760, %rsp # imm = 0xAC8
 ; AVX2-ONLY-NEXT:    vzeroupper
 ; AVX2-ONLY-NEXT:    retq
 ;

diff  --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-6.ll
index 664be09b5118a4..71ebeb1d24afa8 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-6.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-6.ll
@@ -239,9 +239,12 @@ define void @store_i64_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-ONLY-NEXT:    vbroadcastsd 8(%r8), %ymm12
 ; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7]
 ; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm5 = ymm11[0,1,2,3,4,5],ymm5[6,7]
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm11 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm12 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm11 = ymm12[2,3],ymm11[2,3]
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm9, %ymm7, %ymm7
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm10, %ymm8, %ymm8
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm7 = ymm8[0],ymm7[0],ymm8[2],ymm7[2]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm8 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm9 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm8 = ymm9[2,3],ymm8[2,3]
 ; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
 ; AVX2-ONLY-NEXT:    vbroadcastsd 24(%r8), %ymm3
 ; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],ymm3[2,3]
@@ -250,12 +253,9 @@ define void @store_i64_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm4[2,3],ymm0[2,3]
 ; AVX2-ONLY-NEXT:    vbroadcastsd 16(%r9), %ymm1
 ; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm8[0],xmm7[0]
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm3 = xmm10[0],xmm9[0]
-; AVX2-ONLY-NEXT:    vmovaps %ymm11, 96(%rax)
-; AVX2-ONLY-NEXT:    vmovaps %xmm3, 16(%rax)
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, (%rax)
+; AVX2-ONLY-NEXT:    vmovaps %ymm8, 96(%rax)
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, 128(%rax)
+; AVX2-ONLY-NEXT:    vmovaps %ymm7, (%rax)
 ; AVX2-ONLY-NEXT:    vmovaps %ymm2, 160(%rax)
 ; AVX2-ONLY-NEXT:    vmovaps %ymm5, 64(%rax)
 ; AVX2-ONLY-NEXT:    vmovaps %ymm6, 32(%rax)
@@ -512,97 +512,85 @@ define void @store_i64_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ;
 ; AVX2-ONLY-LABEL: store_i64_stride6_vf8:
 ; AVX2-ONLY:       # %bb.0:
-; AVX2-ONLY-NEXT:    pushq %rax
-; AVX2-ONLY-NEXT:    vmovaps 32(%rdx), %ymm7
-; AVX2-ONLY-NEXT:    vmovaps (%r8), %ymm3
-; AVX2-ONLY-NEXT:    vmovaps 32(%r8), %ymm12
-; AVX2-ONLY-NEXT:    vmovaps (%r9), %xmm9
-; AVX2-ONLY-NEXT:    vmovaps 32(%r9), %xmm13
-; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm0 = xmm9[0,0]
-; AVX2-ONLY-NEXT:    vmovaps (%rsi), %xmm1
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 32(%rsi), %xmm4
-; AVX2-ONLY-NEXT:    vmovaps (%rdi), %xmm2
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm5 = xmm2[1],xmm1[1]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm5 = ymm3[0,1],ymm5[0,1]
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3],ymm5[4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm13, %ymm0, %ymm5
-; AVX2-ONLY-NEXT:    vmovaps (%rcx), %xmm6
+; AVX2-ONLY-NEXT:    vmovaps (%r8), %ymm1
+; AVX2-ONLY-NEXT:    vmovaps 32(%r8), %ymm3
+; AVX2-ONLY-NEXT:    vmovaps (%r9), %xmm4
+; AVX2-ONLY-NEXT:    vmovaps 32(%r9), %xmm5
+; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm0 = xmm4[0,0]
+; AVX2-ONLY-NEXT:    vmovaps (%rsi), %xmm6
+; AVX2-ONLY-NEXT:    vmovaps 32(%rsi), %xmm7
+; AVX2-ONLY-NEXT:    vmovaps (%rdi), %xmm8
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm2 = xmm8[1],xmm6[1]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm1[0,1],ymm2[0,1]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7]
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm5, %ymm0, %ymm2
+; AVX2-ONLY-NEXT:    vmovaps (%rcx), %xmm9
 ; AVX2-ONLY-NEXT:    vmovaps 32(%rcx), %xmm10
-; AVX2-ONLY-NEXT:    vmovaps (%rdx), %xmm8
-; AVX2-ONLY-NEXT:    vmovaps 32(%rdx), %xmm11
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm14 = xmm11[1],xmm10[1]
-; AVX2-ONLY-NEXT:    vbroadcastsd 40(%r8), %ymm15
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5,6,7]
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm5[6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm14 = xmm8[1],xmm6[1]
-; AVX2-ONLY-NEXT:    vbroadcastsd 8(%r8), %ymm15
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovaps 32(%rdi), %xmm15
-; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm9, %ymm0, %ymm9
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm9[6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm13 = xmm13[0,0]
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm14 = xmm15[1],xmm4[1]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm14 = ymm12[0,1],ymm14[0,1]
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm13 = ymm14[0,1],ymm13[2,3],ymm14[4,5,6,7]
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm14 = ymm7[1],mem[1],ymm7[3],mem[3]
-; AVX2-ONLY-NEXT:    vbroadcastsd 56(%r8), %ymm0
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm14[2,3],ymm0[2,3]
-; AVX2-ONLY-NEXT:    vmovaps 32(%rdi), %ymm14
-; AVX2-ONLY-NEXT:    vmovaps 32(%rsi), %ymm0
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm5 = ymm14[1],ymm0[1],ymm14[3],ymm0[3]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm5 = ymm12[2,3],ymm5[2,3]
-; AVX2-ONLY-NEXT:    vbroadcastsd 48(%r9), %ymm12
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm12[2,3],ymm5[4,5,6,7]
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm14[0],ymm0[0],ymm14[2],ymm0[2]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm7[2,3]
-; AVX2-ONLY-NEXT:    vbroadcastsd 48(%rcx), %ymm7
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm7[6,7]
-; AVX2-ONLY-NEXT:    vmovaps (%rdx), %ymm7
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm12 = ymm7[1],mem[1],ymm7[3],mem[3]
-; AVX2-ONLY-NEXT:    vbroadcastsd 24(%r8), %ymm14
+; AVX2-ONLY-NEXT:    vmovaps (%rdx), %xmm11
+; AVX2-ONLY-NEXT:    vmovaps 32(%rdx), %xmm12
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm13 = xmm12[1],xmm10[1]
+; AVX2-ONLY-NEXT:    vbroadcastsd 40(%r8), %ymm14
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm2 = ymm13[0,1,2,3,4,5],ymm2[6,7]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm13 = xmm11[1],xmm9[1]
+; AVX2-ONLY-NEXT:    vbroadcastsd 8(%r8), %ymm14
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7]
+; AVX2-ONLY-NEXT:    vmovaps 32(%rdi), %xmm14
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm4
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm4 = ymm13[0,1,2,3,4,5],ymm4[6,7]
+; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm5 = xmm5[0,0]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm13 = xmm14[1],xmm7[1]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm13 = ymm3[0,1],ymm13[0,1]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm5 = ymm13[0,1],ymm5[2,3],ymm13[4,5,6,7]
+; AVX2-ONLY-NEXT:    vmovaps (%rdi), %ymm13
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm9, %ymm6, %ymm6
+; AVX2-ONLY-NEXT:    vmovaps 32(%rdi), %ymm9
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm11, %ymm8, %ymm8
+; AVX2-ONLY-NEXT:    vmovaps (%rsi), %ymm11
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm6 = ymm8[0],ymm6[0],ymm8[2],ymm6[2]
+; AVX2-ONLY-NEXT:    vmovaps 32(%rsi), %ymm8
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm10, %ymm7, %ymm7
+; AVX2-ONLY-NEXT:    vmovaps 32(%rdx), %ymm10
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm12, %ymm14, %ymm12
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm7 = ymm12[0],ymm7[0],ymm12[2],ymm7[2]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm12 = ymm10[1],mem[1],ymm10[3],mem[3]
+; AVX2-ONLY-NEXT:    vbroadcastsd 56(%r8), %ymm14
 ; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm12 = ymm12[2,3],ymm14[2,3]
-; AVX2-ONLY-NEXT:    vmovaps (%rdi), %ymm14
-; AVX2-ONLY-NEXT:    vmovaps (%rsi), %ymm0
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm9 = ymm14[1],ymm0[1],ymm14[3],ymm0[3]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],ymm9[2,3]
-; AVX2-ONLY-NEXT:    vbroadcastsd 16(%r9), %ymm9
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm9[2,3],ymm3[4,5,6,7]
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm14[0],ymm0[0],ymm14[2],ymm0[2]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm7[2,3]
-; AVX2-ONLY-NEXT:    vbroadcastsd 16(%rcx), %ymm7
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm7[6,7]
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm4 = xmm15[0],xmm4[0]
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm7 = xmm11[0],xmm10[0]
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload
-; AVX2-ONLY-NEXT:    # xmm9 = xmm9[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm6 = xmm8[0],xmm6[0]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm14 = ymm9[1],ymm8[1],ymm9[3],ymm8[3]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],ymm14[2,3]
+; AVX2-ONLY-NEXT:    vbroadcastsd 48(%r9), %ymm14
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm14[2,3],ymm3[4,5,6,7]
+; AVX2-ONLY-NEXT:    vmovaps (%rdx), %ymm14
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm8 = ymm9[0],ymm8[0],ymm9[2],ymm8[2]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm8 = ymm8[2,3],ymm10[2,3]
+; AVX2-ONLY-NEXT:    vbroadcastsd 48(%rcx), %ymm9
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm9[6,7]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm9 = ymm14[1],mem[1],ymm14[3],mem[3]
+; AVX2-ONLY-NEXT:    vbroadcastsd 24(%r8), %ymm10
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm9 = ymm9[2,3],ymm10[2,3]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm10 = ymm13[1],ymm11[1],ymm13[3],ymm11[3]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm10[2,3]
+; AVX2-ONLY-NEXT:    vbroadcastsd 16(%r9), %ymm10
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm10[2,3],ymm1[4,5,6,7]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm10 = ymm13[0],ymm11[0],ymm13[2],ymm11[2]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm10 = ymm10[2,3],ymm14[2,3]
+; AVX2-ONLY-NEXT:    vbroadcastsd 16(%rcx), %ymm11
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm11[6,7]
 ; AVX2-ONLY-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],mem[6,7]
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm8 = ymm12[0,1,2,3,4,5],mem[6,7]
-; AVX2-ONLY-NEXT:    vmovaps %xmm6, 16(%rax)
-; AVX2-ONLY-NEXT:    vmovaps %xmm9, (%rax)
-; AVX2-ONLY-NEXT:    vmovaps %xmm7, 208(%rax)
-; AVX2-ONLY-NEXT:    vmovaps %xmm4, 192(%rax)
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 96(%rax)
-; AVX2-ONLY-NEXT:    vmovaps %ymm3, 128(%rax)
-; AVX2-ONLY-NEXT:    vmovaps %ymm8, 160(%rax)
-; AVX2-ONLY-NEXT:    vmovaps %ymm1, 288(%rax)
-; AVX2-ONLY-NEXT:    vmovaps %ymm5, 320(%rax)
-; AVX2-ONLY-NEXT:    vmovaps %ymm2, 352(%rax)
-; AVX2-ONLY-NEXT:    vmovaps %ymm13, 224(%rax)
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 64(%rax)
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 256(%rax)
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],mem[6,7]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7]
+; AVX2-ONLY-NEXT:    vmovaps %ymm10, 96(%rax)
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, 128(%rax)
+; AVX2-ONLY-NEXT:    vmovaps %ymm7, 192(%rax)
+; AVX2-ONLY-NEXT:    vmovaps %ymm9, 160(%rax)
+; AVX2-ONLY-NEXT:    vmovaps %ymm8, 288(%rax)
+; AVX2-ONLY-NEXT:    vmovaps %ymm3, 320(%rax)
+; AVX2-ONLY-NEXT:    vmovaps %ymm11, 352(%rax)
+; AVX2-ONLY-NEXT:    vmovaps %ymm6, (%rax)
+; AVX2-ONLY-NEXT:    vmovaps %ymm5, 224(%rax)
+; AVX2-ONLY-NEXT:    vmovaps %ymm4, 64(%rax)
+; AVX2-ONLY-NEXT:    vmovaps %ymm2, 256(%rax)
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, 32(%rax)
-; AVX2-ONLY-NEXT:    popq %rax
 ; AVX2-ONLY-NEXT:    vzeroupper
 ; AVX2-ONLY-NEXT:    retq
 ;
@@ -1256,196 +1244,182 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ;
 ; AVX2-ONLY-LABEL: store_i64_stride6_vf16:
 ; AVX2-ONLY:       # %bb.0:
-; AVX2-ONLY-NEXT:    subq $456, %rsp # imm = 0x1C8
-; AVX2-ONLY-NEXT:    vmovaps (%r8), %ymm9
-; AVX2-ONLY-NEXT:    vmovaps 32(%r8), %ymm8
-; AVX2-ONLY-NEXT:    vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    subq $360, %rsp # imm = 0x168
+; AVX2-ONLY-NEXT:    vmovaps (%r8), %ymm5
+; AVX2-ONLY-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 32(%r8), %ymm4
+; AVX2-ONLY-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovaps (%r9), %xmm1
 ; AVX2-ONLY-NEXT:    vmovaps 32(%r9), %xmm0
 ; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm2 = xmm1[0,0]
-; AVX2-ONLY-NEXT:    vmovaps (%rsi), %xmm4
-; AVX2-ONLY-NEXT:    vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps (%rsi), %xmm3
+; AVX2-ONLY-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovaps 32(%rsi), %xmm6
-; AVX2-ONLY-NEXT:    vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps (%rdi), %xmm3
-; AVX2-ONLY-NEXT:    vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 32(%rdi), %xmm5
-; AVX2-ONLY-NEXT:    vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm9[0,1],ymm3[0,1]
+; AVX2-ONLY-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps (%rdi), %xmm15
+; AVX2-ONLY-NEXT:    vmovaps 32(%rdi), %xmm8
+; AVX2-ONLY-NEXT:    vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm3 = xmm15[1],xmm3[1]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm5[0,1],ymm3[0,1]
 ; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5,6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-ONLY-NEXT:    vmovaps (%rcx), %xmm3
-; AVX2-ONLY-NEXT:    vmovaps %xmm3, (%rsp) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 32(%rcx), %xmm7
-; AVX2-ONLY-NEXT:    vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps (%rdx), %xmm2
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 32(%rdx), %xmm4
-; AVX2-ONLY-NEXT:    vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1]
-; AVX2-ONLY-NEXT:    vbroadcastsd 8(%r8), %ymm3
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX2-ONLY-NEXT:    vmovaps (%rcx), %xmm10
+; AVX2-ONLY-NEXT:    vmovaps 32(%rcx), %xmm13
+; AVX2-ONLY-NEXT:    vmovaps (%rdx), %xmm7
+; AVX2-ONLY-NEXT:    vmovaps 32(%rdx), %xmm11
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm2 = xmm7[1],xmm10[1]
+; AVX2-ONLY-NEXT:    vbroadcastsd 8(%r8), %ymm9
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7]
 ; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm1 = xmm0[0,0]
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm2 = xmm5[1],xmm6[1]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm8[0,1],ymm2[0,1]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm2 = xmm8[1],xmm6[1]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm4[0,1],ymm2[0,1]
 ; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm1 = xmm4[1],xmm7[1]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm1 = xmm11[1],xmm13[1]
 ; AVX2-ONLY-NEXT:    vbroadcastsd 40(%r8), %ymm2
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovaps 64(%r8), %ymm6
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-ONLY-NEXT:    vmovaps 64(%r8), %ymm2
+; AVX2-ONLY-NEXT:    vmovups %ymm2, (%rsp) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 64(%rsi), %xmm1
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 64(%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm6[0,1],ymm0[0,1]
-; AVX2-ONLY-NEXT:    vmovaps 64(%r9), %xmm2
-; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm3 = xmm2[0,0]
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 64(%rcx), %xmm11
-; AVX2-ONLY-NEXT:    vmovaps 64(%rdx), %xmm10
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm10[1],xmm11[1]
-; AVX2-ONLY-NEXT:    vbroadcastsd 72(%r8), %ymm3
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
-; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 96(%r8), %ymm4
-; AVX2-ONLY-NEXT:    vmovaps 96(%rsi), %xmm8
-; AVX2-ONLY-NEXT:    vmovaps 96(%rdi), %xmm7
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm2 = xmm7[1],xmm8[1]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm4[0,1],ymm2[0,1]
-; AVX2-ONLY-NEXT:    vmovaps 96(%r9), %xmm5
-; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm3 = xmm5[0,0]
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm3[2,3],ymm2[4,5,6,7]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 96(%rcx), %xmm3
-; AVX2-ONLY-NEXT:    vmovaps 96(%rdx), %xmm2
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm12 = xmm2[1],xmm3[1]
-; AVX2-ONLY-NEXT:    vbroadcastsd 104(%r8), %ymm15
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm15[4,5,6,7]
-; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm5, %ymm0, %ymm5
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5],ymm5[6,7]
+; AVX2-ONLY-NEXT:    vmovaps 64(%rsi), %xmm5
+; AVX2-ONLY-NEXT:    vmovaps 64(%rdi), %xmm6
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm6[1],xmm5[1]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm2[0,1],ymm0[0,1]
+; AVX2-ONLY-NEXT:    vmovaps 64(%r9), %xmm9
+; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm12 = xmm9[0,0]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm12[2,3],ymm0[4,5,6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps (%rdi), %ymm5
-; AVX2-ONLY-NEXT:    vmovaps (%rsi), %ymm12
-; AVX2-ONLY-NEXT:    vmovaps (%rdx), %ymm15
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm13 = ymm5[0],ymm12[0],ymm5[2],ymm12[2]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm13 = ymm13[2,3],ymm15[2,3]
-; AVX2-ONLY-NEXT:    vbroadcastsd 16(%rcx), %ymm14
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5],ymm14[6,7]
+; AVX2-ONLY-NEXT:    vmovaps 64(%rcx), %xmm3
+; AVX2-ONLY-NEXT:    vmovaps 64(%rdx), %xmm4
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm12 = xmm4[1],xmm3[1]
+; AVX2-ONLY-NEXT:    vbroadcastsd 72(%r8), %ymm8
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm8 = ymm12[0,1,2,3],ymm8[4,5,6,7]
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm9, %ymm0, %ymm9
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5],ymm9[6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm5 = ymm5[1],ymm12[1],ymm5[3],ymm12[3]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm5 = ymm9[2,3],ymm5[2,3]
-; AVX2-ONLY-NEXT:    vbroadcastsd 16(%r9), %ymm9
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm9[2,3],ymm5[4,5,6,7]
+; AVX2-ONLY-NEXT:    vmovaps 96(%r8), %ymm0
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm5 = ymm15[1],mem[1],ymm15[3],mem[3]
-; AVX2-ONLY-NEXT:    vbroadcastsd 24(%r8), %ymm9
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm5[2,3],ymm9[2,3]
+; AVX2-ONLY-NEXT:    vmovaps 96(%rsi), %xmm8
+; AVX2-ONLY-NEXT:    vmovaps 96(%rdi), %xmm2
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm9 = xmm2[1],xmm8[1]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm9 = ymm0[0,1],ymm9[0,1]
+; AVX2-ONLY-NEXT:    vmovaps 96(%r9), %xmm0
+; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm12 = xmm0[0,0]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm9[0,1],ymm12[2,3],ymm9[4,5,6,7]
+; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 96(%rcx), %xmm12
+; AVX2-ONLY-NEXT:    vmovaps 96(%rdx), %xmm1
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm9 = xmm1[1],xmm12[1]
+; AVX2-ONLY-NEXT:    vbroadcastsd 104(%r8), %ymm14
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm14[4,5,6,7]
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 32(%rdi), %ymm5
-; AVX2-ONLY-NEXT:    vmovaps 32(%rsi), %ymm9
-; AVX2-ONLY-NEXT:    vmovaps 32(%rdx), %ymm13
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm14 = ymm5[0],ymm9[0],ymm5[2],ymm9[2]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm14 = ymm14[2,3],ymm13[2,3]
-; AVX2-ONLY-NEXT:    vbroadcastsd 48(%rcx), %ymm15
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm15[6,7]
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm10, %ymm0, %ymm0
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm7, %ymm15, %ymm7
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm7[0],ymm0[0],ymm7[2],ymm0[2]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm5 = ymm5[1],ymm9[1],ymm5[3],ymm9[3]
-; AVX2-ONLY-NEXT:    vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT:    # ymm5 = mem[2,3],ymm5[2,3]
-; AVX2-ONLY-NEXT:    vbroadcastsd 48(%r9), %ymm9
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm9[2,3],ymm5[4,5,6,7]
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm13, %ymm0, %ymm0
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm11, %ymm7, %ymm10
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm10[0],ymm0[0],ymm10[2],ymm0[2]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm5 = ymm13[1],mem[1],ymm13[3],mem[3]
-; AVX2-ONLY-NEXT:    vbroadcastsd 56(%r8), %ymm9
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm5 = ymm5[2,3],ymm9[2,3]
-; AVX2-ONLY-NEXT:    vmovaps 64(%rdi), %ymm13
-; AVX2-ONLY-NEXT:    vmovaps 64(%rsi), %ymm0
-; AVX2-ONLY-NEXT:    vmovaps 64(%rdx), %ymm1
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm9 = ymm13[0],ymm0[0],ymm13[2],ymm0[2]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm9 = ymm9[2,3],ymm1[2,3]
-; AVX2-ONLY-NEXT:    vbroadcastsd 80(%rcx), %ymm15
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm15[6,7]
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm13[1],ymm0[1],ymm13[3],ymm0[3]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm6[2,3],ymm0[2,3]
-; AVX2-ONLY-NEXT:    vbroadcastsd 80(%r9), %ymm6
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm6[2,3],ymm0[4,5,6,7]
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3]
-; AVX2-ONLY-NEXT:    vbroadcastsd 88(%r8), %ymm6
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm6[2,3]
-; AVX2-ONLY-NEXT:    vmovaps 96(%rdi), %ymm6
-; AVX2-ONLY-NEXT:    vmovaps 96(%rsi), %ymm13
-; AVX2-ONLY-NEXT:    vmovaps 96(%rdx), %ymm15
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm14 = ymm6[0],ymm13[0],ymm6[2],ymm13[2]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm14 = ymm14[2,3],ymm15[2,3]
-; AVX2-ONLY-NEXT:    vbroadcastsd 112(%rcx), %ymm12
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm12 = ymm14[0,1,2,3,4,5],ymm12[6,7]
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm6 = ymm6[1],ymm13[1],ymm6[3],ymm13[3]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],ymm6[2,3]
-; AVX2-ONLY-NEXT:    vbroadcastsd 112(%r9), %ymm6
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm6[2,3],ymm4[4,5,6,7]
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm6 = ymm15[1],mem[1],ymm15[3],mem[3]
-; AVX2-ONLY-NEXT:    vbroadcastsd 120(%r8), %ymm13
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm6 = ymm6[2,3],ymm13[2,3]
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload
-; AVX2-ONLY-NEXT:    # xmm13 = xmm13[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm10 = xmm10[0],xmm11[0]
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload
-; AVX2-ONLY-NEXT:    # xmm11 = xmm11[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload
-; AVX2-ONLY-NEXT:    # xmm14 = xmm14[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm7 = xmm7[0],xmm8[0]
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
-; AVX2-ONLY-NEXT:    # xmm3 = xmm3[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vunpcklpd (%rsp), %xmm8, %xmm8 # 16-byte Folded Reload
-; AVX2-ONLY-NEXT:    # xmm8 = xmm8[0],mem[0]
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm3, %ymm5, %ymm0
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm4, %ymm6, %ymm3
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm14 = ymm3[0],ymm0[0],ymm3[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm12, %ymm8, %ymm0
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vmovaps (%rdi), %ymm0
+; AVX2-ONLY-NEXT:    vmovaps (%rsi), %ymm2
+; AVX2-ONLY-NEXT:    vmovaps (%rdx), %ymm3
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm4 = ymm0[0],ymm2[0],ymm0[2],ymm2[2]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],ymm3[2,3]
+; AVX2-ONLY-NEXT:    vbroadcastsd 16(%rcx), %ymm5
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm15 = ymm4[0,1,2,3,4,5],ymm5[6,7]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3]
+; AVX2-ONLY-NEXT:    vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm0 = mem[2,3],ymm0[2,3]
+; AVX2-ONLY-NEXT:    vbroadcastsd 16(%r9), %ymm2
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm7 = ymm0[0,1],ymm2[2,3],ymm0[4,5,6,7]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm2 = ymm3[1],mem[1],ymm3[3],mem[3]
+; AVX2-ONLY-NEXT:    vbroadcastsd 24(%r8), %ymm3
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm2[2,3],ymm3[2,3]
+; AVX2-ONLY-NEXT:    vmovaps 32(%rdi), %ymm3
+; AVX2-ONLY-NEXT:    vmovaps 32(%rsi), %ymm5
+; AVX2-ONLY-NEXT:    vmovaps 32(%rdx), %ymm6
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm3[0],ymm5[0],ymm3[2],ymm5[2]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],ymm6[2,3]
+; AVX2-ONLY-NEXT:    vbroadcastsd 48(%rcx), %ymm8
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm8[6,7]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm5[1],ymm3[3],ymm5[3]
+; AVX2-ONLY-NEXT:    vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm3 = mem[2,3],ymm3[2,3]
+; AVX2-ONLY-NEXT:    vbroadcastsd 48(%r9), %ymm5
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm5[2,3],ymm3[4,5,6,7]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm5 = ymm6[1],mem[1],ymm6[3],mem[3]
+; AVX2-ONLY-NEXT:    vbroadcastsd 56(%r8), %ymm6
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm5 = ymm5[2,3],ymm6[2,3]
+; AVX2-ONLY-NEXT:    vmovaps 64(%rdi), %ymm6
+; AVX2-ONLY-NEXT:    vmovaps 64(%rsi), %ymm8
+; AVX2-ONLY-NEXT:    vmovaps 64(%rdx), %ymm10
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm11 = ymm6[0],ymm8[0],ymm6[2],ymm8[2]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm11 = ymm11[2,3],ymm10[2,3]
+; AVX2-ONLY-NEXT:    vbroadcastsd 80(%rcx), %ymm12
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm12[6,7]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm6 = ymm6[1],ymm8[1],ymm6[3],ymm8[3]
+; AVX2-ONLY-NEXT:    vperm2f128 $19, (%rsp), %ymm6, %ymm6 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm6 = mem[2,3],ymm6[2,3]
+; AVX2-ONLY-NEXT:    vbroadcastsd 80(%r9), %ymm8
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm8[2,3],ymm6[4,5,6,7]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm8 = ymm10[1],mem[1],ymm10[3],mem[3]
+; AVX2-ONLY-NEXT:    vbroadcastsd 88(%r8), %ymm10
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm8 = ymm8[2,3],ymm10[2,3]
+; AVX2-ONLY-NEXT:    vmovaps 96(%rdi), %ymm10
+; AVX2-ONLY-NEXT:    vmovaps 96(%rsi), %ymm12
+; AVX2-ONLY-NEXT:    vmovaps 96(%rdx), %ymm0
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm9 = ymm10[0],ymm12[0],ymm10[2],ymm12[2]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm9 = ymm9[2,3],ymm0[2,3]
+; AVX2-ONLY-NEXT:    vbroadcastsd 112(%rcx), %ymm13
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm13[6,7]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm10 = ymm10[1],ymm12[1],ymm10[3],ymm12[3]
+; AVX2-ONLY-NEXT:    vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm10 = mem[2,3],ymm10[2,3]
+; AVX2-ONLY-NEXT:    vbroadcastsd 112(%r9), %ymm12
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm12[2,3],ymm10[4,5,6,7]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
+; AVX2-ONLY-NEXT:    vbroadcastsd 120(%r8), %ymm12
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm12[2,3]
 ; AVX2-ONLY-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3,4,5],mem[6,7]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],mem[6,7]
 ; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],mem[6,7]
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7]
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],mem[6,7]
-; AVX2-ONLY-NEXT:    vmovaps %xmm8, 16(%rax)
-; AVX2-ONLY-NEXT:    vmovaps %xmm3, (%rax)
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, 592(%rax)
-; AVX2-ONLY-NEXT:    vmovaps %xmm7, 576(%rax)
-; AVX2-ONLY-NEXT:    vmovaps %xmm14, 208(%rax)
-; AVX2-ONLY-NEXT:    vmovaps %xmm11, 192(%rax)
-; AVX2-ONLY-NEXT:    vmovaps %xmm10, 400(%rax)
-; AVX2-ONLY-NEXT:    vmovaps %xmm13, 384(%rax)
-; AVX2-ONLY-NEXT:    vmovaps %ymm6, 736(%rax)
-; AVX2-ONLY-NEXT:    vmovaps %ymm4, 704(%rax)
-; AVX2-ONLY-NEXT:    vmovaps %ymm12, 672(%rax)
-; AVX2-ONLY-NEXT:    vmovaps %ymm1, 544(%rax)
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 512(%rax)
-; AVX2-ONLY-NEXT:    vmovaps %ymm9, 480(%rax)
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 736(%rax)
+; AVX2-ONLY-NEXT:    vmovaps %ymm10, 704(%rax)
+; AVX2-ONLY-NEXT:    vmovaps %ymm9, 672(%rax)
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, 576(%rax)
+; AVX2-ONLY-NEXT:    vmovaps %ymm8, 544(%rax)
+; AVX2-ONLY-NEXT:    vmovaps %ymm6, 512(%rax)
+; AVX2-ONLY-NEXT:    vmovaps %ymm11, 480(%rax)
+; AVX2-ONLY-NEXT:    vmovaps %ymm14, 384(%rax)
 ; AVX2-ONLY-NEXT:    vmovaps %ymm5, 352(%rax)
+; AVX2-ONLY-NEXT:    vmovaps %ymm3, 320(%rax)
+; AVX2-ONLY-NEXT:    vmovaps %ymm2, 288(%rax)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 320(%rax)
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 288(%rax)
-; AVX2-ONLY-NEXT:    vmovaps %ymm15, 160(%rax)
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 128(%rax)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 192(%rax)
+; AVX2-ONLY-NEXT:    vmovaps %ymm4, 160(%rax)
+; AVX2-ONLY-NEXT:    vmovaps %ymm7, 128(%rax)
+; AVX2-ONLY-NEXT:    vmovaps %ymm15, 96(%rax)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 96(%rax)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, (%rax)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, 640(%rax)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
@@ -1462,7 +1436,7 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, 64(%rax)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, 32(%rax)
-; AVX2-ONLY-NEXT:    addq $456, %rsp # imm = 0x1C8
+; AVX2-ONLY-NEXT:    addq $360, %rsp # imm = 0x168
 ; AVX2-ONLY-NEXT:    vzeroupper
 ; AVX2-ONLY-NEXT:    retq
 ;
@@ -3580,443 +3554,402 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ;
 ; AVX2-ONLY-LABEL: store_i64_stride6_vf32:
 ; AVX2-ONLY:       # %bb.0:
-; AVX2-ONLY-NEXT:    subq $1656, %rsp # imm = 0x678
-; AVX2-ONLY-NEXT:    vmovaps (%r8), %ymm6
-; AVX2-ONLY-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 32(%r8), %ymm12
-; AVX2-ONLY-NEXT:    vmovaps (%r9), %xmm2
+; AVX2-ONLY-NEXT:    subq $1208, %rsp # imm = 0x4B8
+; AVX2-ONLY-NEXT:    vmovaps (%r8), %ymm5
+; AVX2-ONLY-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 32(%r8), %ymm4
+; AVX2-ONLY-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps (%r9), %xmm1
 ; AVX2-ONLY-NEXT:    vmovaps 32(%r9), %xmm0
-; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm3 = xmm2[0,0]
-; AVX2-ONLY-NEXT:    vmovaps (%rsi), %xmm1
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 32(%rsi), %xmm8
-; AVX2-ONLY-NEXT:    vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps (%rdi), %xmm4
-; AVX2-ONLY-NEXT:    vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 32(%rdi), %xmm5
-; AVX2-ONLY-NEXT:    vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm6[0,1],ymm4[0,1]
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm4[0,1],ymm3[2,3],ymm4[4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
-; AVX2-ONLY-NEXT:    vmovaps (%rcx), %xmm1
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 32(%rcx), %xmm6
-; AVX2-ONLY-NEXT:    vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps (%rdx), %xmm3
+; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm2 = xmm1[0,0]
+; AVX2-ONLY-NEXT:    vmovaps (%rsi), %xmm8
+; AVX2-ONLY-NEXT:    vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 32(%rsi), %xmm6
+; AVX2-ONLY-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps (%rdi), %xmm3
+; AVX2-ONLY-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 32(%rdi), %xmm7
+; AVX2-ONLY-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm8[1]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm5[0,1],ymm3[0,1]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5,6,7]
+; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
+; AVX2-ONLY-NEXT:    vmovaps (%rcx), %xmm3
 ; AVX2-ONLY-NEXT:    vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 32(%rdx), %xmm7
-; AVX2-ONLY-NEXT:    vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1]
-; AVX2-ONLY-NEXT:    vbroadcastsd 8(%r8), %ymm4
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7]
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm2[6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm2 = xmm0[0,0]
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm3 = xmm5[1],xmm8[1]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm12[0,1],ymm3[0,1]
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm2[2,3],ymm3[4,5,6,7]
+; AVX2-ONLY-NEXT:    vmovaps 32(%rcx), %xmm5
+; AVX2-ONLY-NEXT:    vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps (%rdx), %xmm2
+; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 32(%rdx), %xmm8
+; AVX2-ONLY-NEXT:    vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1]
+; AVX2-ONLY-NEXT:    vbroadcastsd 8(%r8), %ymm3
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm1 = xmm0[0,0]
 ; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm2 = xmm7[1],xmm6[1]
-; AVX2-ONLY-NEXT:    vbroadcastsd 40(%r8), %ymm3
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm4[0,1],ymm2[0,1]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7]
+; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm1 = xmm8[1],xmm5[1]
+; AVX2-ONLY-NEXT:    vbroadcastsd 40(%r8), %ymm2
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
 ; AVX2-ONLY-NEXT:    vmovaps 64(%r8), %ymm2
+; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovaps 64(%rsi), %xmm1
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovaps 64(%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
 ; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm2[0,1],ymm0[0,1]
-; AVX2-ONLY-NEXT:    vmovaps 64(%r9), %xmm3
-; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm4 = xmm3[0,0]
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3],ymm0[4,5,6,7]
+; AVX2-ONLY-NEXT:    vmovaps 64(%r9), %xmm1
+; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm2 = xmm1[0,0]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5,6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 64(%rcx), %xmm0
+; AVX2-ONLY-NEXT:    vmovaps 64(%rcx), %xmm2
+; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 64(%rdx), %xmm0
 ; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 64(%rdx), %xmm1
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX2-ONLY-NEXT:    vbroadcastsd 72(%r8), %ymm4
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7]
-; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm3
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1]
+; AVX2-ONLY-NEXT:    vbroadcastsd 72(%r8), %ymm2
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 96(%r8), %ymm4
-; AVX2-ONLY-NEXT:    vmovaps 96(%rsi), %xmm1
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 96(%r8), %ymm1
+; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 96(%rsi), %xmm2
+; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovaps 96(%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm4[0,1],ymm0[0,1]
-; AVX2-ONLY-NEXT:    vmovaps 96(%r9), %xmm3
-; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm5 = xmm3[0,0]
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3],ymm0[4,5,6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 96(%rcx), %xmm1
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm1[0,1],ymm0[0,1]
+; AVX2-ONLY-NEXT:    vmovaps 96(%r9), %xmm1
+; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm2 = xmm1[0,0]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5,6,7]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 96(%rcx), %xmm2
+; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX2-ONLY-NEXT:    vmovaps 96(%rdx), %xmm0
 ; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; AVX2-ONLY-NEXT:    vbroadcastsd 104(%r8), %ymm5
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7]
-; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm3
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1]
+; AVX2-ONLY-NEXT:    vbroadcastsd 104(%r8), %ymm2
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 128(%r8), %ymm7
-; AVX2-ONLY-NEXT:    vmovaps 128(%rsi), %xmm1
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 128(%r8), %ymm1
+; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 128(%rsi), %xmm2
+; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovaps 128(%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm7[0,1],ymm0[0,1]
-; AVX2-ONLY-NEXT:    vmovaps 128(%r9), %xmm3
-; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm5 = xmm3[0,0]
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3],ymm0[4,5,6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 128(%rcx), %xmm1
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm1[0,1],ymm0[0,1]
+; AVX2-ONLY-NEXT:    vmovaps 128(%r9), %xmm1
+; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm2 = xmm1[0,0]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5,6,7]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 128(%rcx), %xmm14
 ; AVX2-ONLY-NEXT:    vmovaps 128(%rdx), %xmm0
 ; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; AVX2-ONLY-NEXT:    vbroadcastsd 136(%r8), %ymm5
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7]
-; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm3
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm14[1]
+; AVX2-ONLY-NEXT:    vbroadcastsd 136(%r8), %ymm2
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 160(%r8), %ymm0
-; AVX2-ONLY-NEXT:    vmovaps 160(%rsi), %xmm3
-; AVX2-ONLY-NEXT:    vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 160(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm3 = xmm1[1],xmm3[1]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm0[0,1],ymm3[0,1]
-; AVX2-ONLY-NEXT:    vmovaps 160(%r9), %xmm5
-; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm6 = xmm5[0,0]
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm6[2,3],ymm3[4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 160(%rcx), %xmm3
-; AVX2-ONLY-NEXT:    vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 160(%rdx), %xmm1
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm3 = xmm1[1],xmm3[1]
-; AVX2-ONLY-NEXT:    vbroadcastsd 168(%r8), %ymm6
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7]
-; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm5, %ymm0, %ymm5
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm5[6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 192(%r8), %ymm3
-; AVX2-ONLY-NEXT:    vmovaps 192(%rsi), %xmm5
-; AVX2-ONLY-NEXT:    vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 192(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, (%rsp) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm5 = xmm1[1],xmm5[1]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm5 = ymm3[0,1],ymm5[0,1]
-; AVX2-ONLY-NEXT:    vmovaps 192(%r9), %xmm6
-; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm8 = xmm6[0,0]
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm5[0,1],ymm8[2,3],ymm5[4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 192(%rcx), %xmm5
-; AVX2-ONLY-NEXT:    vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 192(%rdx), %xmm1
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm5 = xmm1[1],xmm5[1]
-; AVX2-ONLY-NEXT:    vbroadcastsd 200(%r8), %ymm8
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7]
-; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm6, %ymm0, %ymm6
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm6[6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 224(%r8), %ymm5
-; AVX2-ONLY-NEXT:    vmovaps 224(%rsi), %xmm15
-; AVX2-ONLY-NEXT:    vmovaps 224(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm6 = xmm1[1],xmm15[1]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm6 = ymm5[0,1],ymm6[0,1]
-; AVX2-ONLY-NEXT:    vmovaps 224(%r9), %xmm9
-; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm8 = xmm9[0,0]
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm6[0,1],ymm8[2,3],ymm6[4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 224(%rcx), %xmm8
-; AVX2-ONLY-NEXT:    vmovaps 224(%rdx), %xmm11
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm10 = xmm11[1],xmm8[1]
-; AVX2-ONLY-NEXT:    vbroadcastsd 232(%r8), %ymm13
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm13[4,5,6,7]
-; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm9, %ymm0, %ymm9
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm10[0,1,2,3,4,5],ymm9[6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps (%rdi), %ymm9
-; AVX2-ONLY-NEXT:    vmovaps (%rsi), %ymm10
-; AVX2-ONLY-NEXT:    vmovaps (%rdx), %ymm13
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm14 = ymm9[0],ymm10[0],ymm9[2],ymm10[2]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm14 = ymm14[2,3],ymm13[2,3]
-; AVX2-ONLY-NEXT:    vbroadcastsd 16(%rcx), %ymm1
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5],ymm1[6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm1 = ymm9[1],ymm10[1],ymm9[3],ymm10[3]
-; AVX2-ONLY-NEXT:    vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT:    # ymm1 = mem[2,3],ymm1[2,3]
-; AVX2-ONLY-NEXT:    vbroadcastsd 16(%r9), %ymm9
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm9[2,3],ymm1[4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm1 = ymm13[1],mem[1],ymm13[3],mem[3]
-; AVX2-ONLY-NEXT:    vbroadcastsd 24(%r8), %ymm9
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm9[2,3]
-; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 32(%rdi), %ymm1
-; AVX2-ONLY-NEXT:    vmovaps 32(%rsi), %ymm9
-; AVX2-ONLY-NEXT:    vmovaps 32(%rdx), %ymm10
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm13 = ymm1[0],ymm9[0],ymm1[2],ymm9[2]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm13 = ymm13[2,3],ymm10[2,3]
-; AVX2-ONLY-NEXT:    vbroadcastsd 48(%rcx), %ymm14
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm6 = ymm13[0,1,2,3,4,5],ymm14[6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm9[1],ymm1[3],ymm9[3]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm12[2,3],ymm1[2,3]
-; AVX2-ONLY-NEXT:    vbroadcastsd 48(%r9), %ymm9
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm9[2,3],ymm1[4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm1 = ymm10[1],mem[1],ymm10[3],mem[3]
-; AVX2-ONLY-NEXT:    vbroadcastsd 56(%r8), %ymm9
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm9[2,3]
-; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 64(%rdi), %ymm1
-; AVX2-ONLY-NEXT:    vmovaps 64(%rsi), %ymm9
-; AVX2-ONLY-NEXT:    vmovaps 64(%rdx), %ymm10
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm13 = ymm1[0],ymm9[0],ymm1[2],ymm9[2]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm13 = ymm13[2,3],ymm10[2,3]
-; AVX2-ONLY-NEXT:    vbroadcastsd 80(%rcx), %ymm14
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm6 = ymm13[0,1,2,3,4,5],ymm14[6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm9[1],ymm1[3],ymm9[3]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
-; AVX2-ONLY-NEXT:    vbroadcastsd 80(%r9), %ymm2
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7]
+; AVX2-ONLY-NEXT:    vmovaps 160(%r8), %ymm1
 ; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm1 = ymm10[1],mem[1],ymm10[3],mem[3]
-; AVX2-ONLY-NEXT:    vbroadcastsd 88(%r8), %ymm2
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3]
-; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 96(%rdi), %ymm1
-; AVX2-ONLY-NEXT:    vmovaps 96(%rsi), %ymm2
-; AVX2-ONLY-NEXT:    vmovaps 96(%rdx), %ymm9
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm10 = ymm1[0],ymm2[0],ymm1[2],ymm2[2]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm10 = ymm10[2,3],ymm9[2,3]
-; AVX2-ONLY-NEXT:    vbroadcastsd 112(%rcx), %ymm13
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm6 = ymm10[0,1,2,3,4,5],ymm13[6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[2,3]
-; AVX2-ONLY-NEXT:    vbroadcastsd 112(%r9), %ymm2
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7]
+; AVX2-ONLY-NEXT:    vmovaps 160(%rsi), %xmm12
+; AVX2-ONLY-NEXT:    vmovaps 160(%rdi), %xmm13
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm13[1],xmm12[1]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm1[0,1],ymm0[0,1]
+; AVX2-ONLY-NEXT:    vmovaps 160(%r9), %xmm1
+; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm2 = xmm1[0,0]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5,6,7]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 160(%rcx), %xmm10
+; AVX2-ONLY-NEXT:    vmovaps 160(%rdx), %xmm11
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm11[1],xmm10[1]
+; AVX2-ONLY-NEXT:    vbroadcastsd 168(%r8), %ymm2
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 192(%r8), %ymm1
 ; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm1 = ymm9[1],mem[1],ymm9[3],mem[3]
-; AVX2-ONLY-NEXT:    vbroadcastsd 120(%r8), %ymm2
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3]
+; AVX2-ONLY-NEXT:    vmovaps 192(%rsi), %xmm8
+; AVX2-ONLY-NEXT:    vmovaps 192(%rdi), %xmm9
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm9[1],xmm8[1]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm1[0,1],ymm0[0,1]
+; AVX2-ONLY-NEXT:    vmovaps 192(%r9), %xmm1
+; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm2 = xmm1[0,0]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5,6,7]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 192(%rcx), %xmm6
+; AVX2-ONLY-NEXT:    vmovaps 192(%rdx), %xmm7
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm7[1],xmm6[1]
+; AVX2-ONLY-NEXT:    vbroadcastsd 200(%r8), %ymm2
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 224(%r8), %ymm1
 ; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 128(%rdi), %ymm1
-; AVX2-ONLY-NEXT:    vmovaps 128(%rsi), %ymm2
-; AVX2-ONLY-NEXT:    vmovaps 128(%rdx), %ymm4
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm10 = ymm1[0],ymm2[0],ymm1[2],ymm2[2]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm10 = ymm10[2,3],ymm4[2,3]
-; AVX2-ONLY-NEXT:    vbroadcastsd 144(%rcx), %ymm13
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm6 = ymm10[0,1,2,3,4,5],ymm13[6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm7[2,3],ymm1[2,3]
-; AVX2-ONLY-NEXT:    vbroadcastsd 144(%r9), %ymm2
+; AVX2-ONLY-NEXT:    vmovaps 224(%rsi), %xmm4
+; AVX2-ONLY-NEXT:    vmovaps 224(%rdi), %xmm5
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm5[1],xmm4[1]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm1[0,1],ymm0[0,1]
+; AVX2-ONLY-NEXT:    vmovaps 224(%r9), %xmm0
+; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm2 = xmm0[0,0]
 ; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm1 = ymm4[1],mem[1],ymm4[3],mem[3]
-; AVX2-ONLY-NEXT:    vbroadcastsd 152(%r8), %ymm2
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3]
-; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 160(%rdi), %ymm1
-; AVX2-ONLY-NEXT:    vmovaps 160(%rsi), %ymm2
-; AVX2-ONLY-NEXT:    vmovaps 160(%rdx), %ymm7
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm10 = ymm1[0],ymm2[0],ymm1[2],ymm2[2]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm10 = ymm10[2,3],ymm7[2,3]
-; AVX2-ONLY-NEXT:    vbroadcastsd 176(%rcx), %ymm13
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm4 = ymm10[0,1,2,3,4,5],ymm13[6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
-; AVX2-ONLY-NEXT:    vbroadcastsd 176(%r9), %ymm1
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
+; AVX2-ONLY-NEXT:    vmovaps 224(%rcx), %xmm2
+; AVX2-ONLY-NEXT:    vmovaps 224(%rdx), %xmm3
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm1 = xmm3[1],xmm2[1]
+; AVX2-ONLY-NEXT:    vbroadcastsd 232(%r8), %ymm15
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm15[4,5,6,7]
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm7[1],mem[1],ymm7[3],mem[3]
-; AVX2-ONLY-NEXT:    vbroadcastsd 184(%r8), %ymm1
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 16-byte Folded Reload
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 192(%rdi), %ymm0
-; AVX2-ONLY-NEXT:    vmovaps 192(%rsi), %ymm1
-; AVX2-ONLY-NEXT:    vmovaps 192(%rdx), %ymm2
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm7 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm7 = ymm7[2,3],ymm2[2,3]
-; AVX2-ONLY-NEXT:    vbroadcastsd 208(%rcx), %ymm14
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm4 = ymm7[0,1,2,3,4,5],ymm14[6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
-; AVX2-ONLY-NEXT:    vbroadcastsd 208(%r9), %ymm1
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 16-byte Folded Reload
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 16-byte Folded Reload
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 16-byte Folded Reload
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm14, %ymm0, %ymm0
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 16-byte Folded Reload
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm10, %ymm12, %ymm0
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm11, %ymm13, %ymm1
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm6, %ymm8, %ymm0
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm7, %ymm9, %ymm1
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm2, %ymm4, %ymm0
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm3, %ymm5, %ymm1
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps (%rdi), %ymm0
+; AVX2-ONLY-NEXT:    vmovaps (%rsi), %ymm1
+; AVX2-ONLY-NEXT:    vmovaps (%rdx), %ymm2
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],ymm2[2,3]
+; AVX2-ONLY-NEXT:    vbroadcastsd 16(%rcx), %ymm4
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7]
+; AVX2-ONLY-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; AVX2-ONLY-NEXT:    vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm0 = mem[2,3],ymm0[2,3]
+; AVX2-ONLY-NEXT:    vbroadcastsd 16(%r9), %ymm1
 ; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm2[1],mem[1],ymm2[3],mem[3]
-; AVX2-ONLY-NEXT:    vbroadcastsd 216(%r8), %ymm1
+; AVX2-ONLY-NEXT:    vbroadcastsd 24(%r8), %ymm1
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 32(%rdi), %ymm0
+; AVX2-ONLY-NEXT:    vmovaps 32(%rsi), %ymm1
+; AVX2-ONLY-NEXT:    vmovaps 32(%rdx), %ymm2
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],ymm2[2,3]
+; AVX2-ONLY-NEXT:    vbroadcastsd 48(%rcx), %ymm4
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7]
+; AVX2-ONLY-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; AVX2-ONLY-NEXT:    vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm0 = mem[2,3],ymm0[2,3]
+; AVX2-ONLY-NEXT:    vbroadcastsd 48(%r9), %ymm1
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm2[1],mem[1],ymm2[3],mem[3]
+; AVX2-ONLY-NEXT:    vbroadcastsd 56(%r8), %ymm1
 ; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 224(%rdi), %ymm2
-; AVX2-ONLY-NEXT:    vmovaps 224(%rsi), %ymm14
+; AVX2-ONLY-NEXT:    vmovaps 64(%rdi), %ymm0
+; AVX2-ONLY-NEXT:    vmovaps 64(%rsi), %ymm1
+; AVX2-ONLY-NEXT:    vmovaps 64(%rdx), %ymm2
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],ymm2[2,3]
+; AVX2-ONLY-NEXT:    vbroadcastsd 80(%rcx), %ymm4
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7]
+; AVX2-ONLY-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; AVX2-ONLY-NEXT:    vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm0 = mem[2,3],ymm0[2,3]
+; AVX2-ONLY-NEXT:    vbroadcastsd 80(%r9), %ymm1
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm2[1],mem[1],ymm2[3],mem[3]
+; AVX2-ONLY-NEXT:    vbroadcastsd 88(%r8), %ymm1
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm10 = ymm0[2,3],ymm1[2,3]
+; AVX2-ONLY-NEXT:    vmovaps 96(%rdi), %ymm0
+; AVX2-ONLY-NEXT:    vmovaps 96(%rsi), %ymm1
+; AVX2-ONLY-NEXT:    vmovaps 96(%rdx), %ymm2
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],ymm2[2,3]
+; AVX2-ONLY-NEXT:    vbroadcastsd 112(%rcx), %ymm4
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7]
+; AVX2-ONLY-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; AVX2-ONLY-NEXT:    vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm0 = mem[2,3],ymm0[2,3]
+; AVX2-ONLY-NEXT:    vbroadcastsd 112(%r9), %ymm1
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm2[1],mem[1],ymm2[3],mem[3]
+; AVX2-ONLY-NEXT:    vbroadcastsd 120(%r8), %ymm1
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm7 = ymm0[2,3],ymm1[2,3]
+; AVX2-ONLY-NEXT:    vmovaps 128(%rdi), %ymm0
+; AVX2-ONLY-NEXT:    vmovaps 128(%rsi), %ymm1
+; AVX2-ONLY-NEXT:    vmovaps 128(%rdx), %ymm2
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],ymm2[2,3]
+; AVX2-ONLY-NEXT:    vbroadcastsd 144(%rcx), %ymm5
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm9 = ymm4[0,1,2,3,4,5],ymm5[6,7]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; AVX2-ONLY-NEXT:    vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm0 = mem[2,3],ymm0[2,3]
+; AVX2-ONLY-NEXT:    vbroadcastsd 144(%r9), %ymm1
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm8 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm2[1],mem[1],ymm2[3],mem[3]
+; AVX2-ONLY-NEXT:    vbroadcastsd 152(%r8), %ymm1
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm5 = ymm0[2,3],ymm1[2,3]
+; AVX2-ONLY-NEXT:    vmovaps 160(%rdi), %ymm1
+; AVX2-ONLY-NEXT:    vmovaps 160(%rsi), %ymm0
+; AVX2-ONLY-NEXT:    vmovaps 160(%rdx), %ymm2
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm4 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],ymm2[2,3]
+; AVX2-ONLY-NEXT:    vbroadcastsd 176(%rcx), %ymm15
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm6 = ymm4[0,1,2,3,4,5],ymm15[6,7]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX2-ONLY-NEXT:    vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm0 = mem[2,3],ymm0[2,3]
+; AVX2-ONLY-NEXT:    vbroadcastsd 176(%r9), %ymm1
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm4 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm1 = ymm2[1],mem[1],ymm2[3],mem[3]
+; AVX2-ONLY-NEXT:    vbroadcastsd 184(%r8), %ymm2
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm1[2,3],ymm2[2,3]
+; AVX2-ONLY-NEXT:    vmovaps 192(%rdi), %ymm2
+; AVX2-ONLY-NEXT:    vmovaps 192(%rsi), %ymm15
+; AVX2-ONLY-NEXT:    vmovaps 192(%rdx), %ymm0
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm14 = ymm2[0],ymm15[0],ymm2[2],ymm15[2]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm14 = ymm14[2,3],ymm0[2,3]
+; AVX2-ONLY-NEXT:    vbroadcastsd 208(%rcx), %ymm13
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5],ymm13[6,7]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm15[1],ymm2[3],ymm15[3]
+; AVX2-ONLY-NEXT:    vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm2 = mem[2,3],ymm2[2,3]
+; AVX2-ONLY-NEXT:    vbroadcastsd 208(%r9), %ymm14
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm14[2,3],ymm2[4,5,6,7]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
+; AVX2-ONLY-NEXT:    vbroadcastsd 216(%r8), %ymm14
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm0[2,3],ymm14[2,3]
+; AVX2-ONLY-NEXT:    vmovaps 224(%rdi), %ymm14
+; AVX2-ONLY-NEXT:    vmovaps 224(%rsi), %ymm15
 ; AVX2-ONLY-NEXT:    vmovaps 224(%rdx), %ymm0
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm3 = ymm2[0],ymm14[0],ymm2[2],ymm14[2]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],ymm0[2,3]
-; AVX2-ONLY-NEXT:    vbroadcastsd 240(%rcx), %ymm13
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm13[6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm14[1],ymm2[3],ymm14[3]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm5[2,3],ymm2[2,3]
-; AVX2-ONLY-NEXT:    vbroadcastsd 240(%r9), %ymm5
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm5[2,3],ymm2[4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm12 = ymm14[0],ymm15[0],ymm14[2],ymm15[2]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm12 = ymm12[2,3],ymm0[2,3]
+; AVX2-ONLY-NEXT:    vbroadcastsd 240(%rcx), %ymm11
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm12 = ymm14[1],ymm15[1],ymm14[3],ymm15[3]
+; AVX2-ONLY-NEXT:    vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm12 = mem[2,3],ymm12[2,3]
+; AVX2-ONLY-NEXT:    vbroadcastsd 240(%r9), %ymm14
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm12 = ymm12[0,1],ymm14[2,3],ymm12[4,5,6,7]
 ; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
-; AVX2-ONLY-NEXT:    vbroadcastsd 248(%r8), %ymm2
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm13 = ymm0[2,3],ymm2[2,3]
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX2-ONLY-NEXT:    # xmm0 = xmm0[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX2-ONLY-NEXT:    # xmm0 = xmm0[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX2-ONLY-NEXT:    # xmm0 = xmm0[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX2-ONLY-NEXT:    # xmm0 = xmm0[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm14 # 16-byte Folded Reload
-; AVX2-ONLY-NEXT:    # xmm14 = xmm0[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm12 # 16-byte Folded Reload
-; AVX2-ONLY-NEXT:    # xmm12 = xmm0[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm10 # 16-byte Folded Reload
-; AVX2-ONLY-NEXT:    # xmm10 = xmm0[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm9 # 16-byte Folded Reload
-; AVX2-ONLY-NEXT:    # xmm9 = xmm0[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm7 # 16-byte Folded Reload
-; AVX2-ONLY-NEXT:    # xmm7 = xmm0[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload
-; AVX2-ONLY-NEXT:    # xmm6 = xmm0[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm5 = xmm0[0],xmm15[0]
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm4 = xmm11[0],xmm8[0]
-; AVX2-ONLY-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
-; AVX2-ONLY-NEXT:    # xmm3 = xmm0[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
-; AVX2-ONLY-NEXT:    # xmm2 = xmm0[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
-; AVX2-ONLY-NEXT:    # xmm1 = xmm0[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX2-ONLY-NEXT:    # xmm0 = xmm0[0],mem[0]
+; AVX2-ONLY-NEXT:    vbroadcastsd 248(%r8), %ymm14
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm14[2,3]
 ; AVX2-ONLY-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm15 = ymm8[0,1,2,3,4,5],mem[6,7]
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm11 = ymm8[0,1,2,3,4,5],mem[6,7]
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7]
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],mem[6,7]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, 16(%rax)
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, (%rax)
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, 1168(%rax)
-; AVX2-ONLY-NEXT:    vmovaps %xmm3, 1152(%rax)
-; AVX2-ONLY-NEXT:    vmovaps %xmm4, 1360(%rax)
-; AVX2-ONLY-NEXT:    vmovaps %xmm5, 1344(%rax)
-; AVX2-ONLY-NEXT:    vmovaps %xmm6, 976(%rax)
-; AVX2-ONLY-NEXT:    vmovaps %xmm7, 960(%rax)
-; AVX2-ONLY-NEXT:    vmovaps %xmm9, 592(%rax)
-; AVX2-ONLY-NEXT:    vmovaps %xmm10, 576(%rax)
-; AVX2-ONLY-NEXT:    vmovaps %xmm12, 208(%rax)
-; AVX2-ONLY-NEXT:    vmovaps %xmm14, 192(%rax)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, 400(%rax)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, 384(%rax)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, 784(%rax)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, 768(%rax)
-; AVX2-ONLY-NEXT:    vmovaps %ymm13, 1504(%rax)
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1472(%rax)
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1440(%rax)
-; AVX2-ONLY-NEXT:    vmovaps %ymm8, 1312(%rax)
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1280(%rax)
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1248(%rax)
-; AVX2-ONLY-NEXT:    vmovaps %ymm11, 1120(%rax)
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1088(%rax)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],mem[6,7]
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3,4,5],mem[6,7]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],mem[6,7]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],mem[6,7]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],mem[6,7]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1504(%rax)
+; AVX2-ONLY-NEXT:    vmovaps %ymm12, 1472(%rax)
+; AVX2-ONLY-NEXT:    vmovaps %ymm11, 1440(%rax)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1056(%rax)
-; AVX2-ONLY-NEXT:    vmovaps %ymm15, 928(%rax)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1344(%rax)
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, 1312(%rax)
+; AVX2-ONLY-NEXT:    vmovaps %ymm2, 1280(%rax)
+; AVX2-ONLY-NEXT:    vmovaps %ymm13, 1248(%rax)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 896(%rax)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1152(%rax)
+; AVX2-ONLY-NEXT:    vmovaps %ymm3, 1120(%rax)
+; AVX2-ONLY-NEXT:    vmovaps %ymm4, 1088(%rax)
+; AVX2-ONLY-NEXT:    vmovaps %ymm6, 1056(%rax)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 864(%rax)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 960(%rax)
+; AVX2-ONLY-NEXT:    vmovaps %ymm5, 928(%rax)
+; AVX2-ONLY-NEXT:    vmovaps %ymm8, 896(%rax)
+; AVX2-ONLY-NEXT:    vmovaps %ymm9, 864(%rax)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 736(%rax)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 768(%rax)
+; AVX2-ONLY-NEXT:    vmovaps %ymm7, 736(%rax)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, 704(%rax)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, 672(%rax)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 544(%rax)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 576(%rax)
+; AVX2-ONLY-NEXT:    vmovaps %ymm10, 544(%rax)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, 512(%rax)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, 480(%rax)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 352(%rax)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 384(%rax)
+; AVX2-ONLY-NEXT:    vmovaps %ymm15, 352(%rax)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, 320(%rax)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, 288(%rax)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 160(%rax)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 192(%rax)
+; AVX2-ONLY-NEXT:    vmovaps %ymm14, 160(%rax)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, 128(%rax)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, 96(%rax)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, (%rax)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1408(%rax)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1376(%rax)
@@ -4048,7 +3981,7 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, 64(%rax)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, 32(%rax)
-; AVX2-ONLY-NEXT:    addq $1656, %rsp # imm = 0x678
+; AVX2-ONLY-NEXT:    addq $1208, %rsp # imm = 0x4B8
 ; AVX2-ONLY-NEXT:    vzeroupper
 ; AVX2-ONLY-NEXT:    retq
 ;
@@ -8362,23 +8295,23 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ;
 ; AVX2-ONLY-LABEL: store_i64_stride6_vf64:
 ; AVX2-ONLY:       # %bb.0:
-; AVX2-ONLY-NEXT:    subq $3656, %rsp # imm = 0xE48
-; AVX2-ONLY-NEXT:    vmovaps 32(%r8), %ymm7
-; AVX2-ONLY-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps (%r8), %ymm6
-; AVX2-ONLY-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    subq $2968, %rsp # imm = 0xB98
+; AVX2-ONLY-NEXT:    vmovaps 32(%r8), %ymm3
+; AVX2-ONLY-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps (%r8), %ymm4
+; AVX2-ONLY-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovaps (%r9), %xmm0
 ; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm1 = xmm0[0,0]
-; AVX2-ONLY-NEXT:    vmovaps (%rsi), %xmm3
-; AVX2-ONLY-NEXT:    vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps (%rsi), %xmm7
+; AVX2-ONLY-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovaps 32(%rsi), %xmm5
-; AVX2-ONLY-NEXT:    vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-ONLY-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovaps (%rdi), %xmm2
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 32(%rdi), %xmm4
-; AVX2-ONLY-NEXT:    vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm6[0,1],ymm2[0,1]
+; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 32(%rdi), %xmm6
+; AVX2-ONLY-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm7[1]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm4[0,1],ymm2[0,1]
 ; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
@@ -8386,8 +8319,8 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX2-ONLY-NEXT:    vmovaps (%rdx), %xmm1
 ; AVX2-ONLY-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 32(%rdx), %xmm3
-; AVX2-ONLY-NEXT:    vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 32(%rdx), %xmm4
+; AVX2-ONLY-NEXT:    vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
 ; AVX2-ONLY-NEXT:    vbroadcastsd 8(%r8), %ymm2
 ; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
@@ -8395,13 +8328,13 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovaps 32(%r9), %xmm0
 ; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm1 = xmm0[0,0]
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm2 = xmm4[1],xmm5[1]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm7[0,1],ymm2[0,1]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm2 = xmm6[1],xmm5[1]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm3[0,1],ymm2[0,1]
 ; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovaps 32(%rcx), %xmm1
 ; AVX2-ONLY-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm1 = xmm3[1],xmm1[1]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm1 = xmm4[1],xmm1[1]
 ; AVX2-ONLY-NEXT:    vbroadcastsd 40(%r8), %ymm2
 ; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
 ; AVX2-ONLY-NEXT:    vmovaps 64(%r8), %ymm2
@@ -8410,9 +8343,9 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovaps 64(%rsi), %xmm1
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovaps 64(%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
 ; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm2[0,1],ymm0[0,1]
 ; AVX2-ONLY-NEXT:    vmovaps 64(%r9), %xmm1
@@ -8429,14 +8362,14 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
 ; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 96(%r8), %ymm2
+; AVX2-ONLY-NEXT:    vmovaps 96(%r8), %ymm1
+; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 96(%rsi), %xmm2
 ; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 96(%rsi), %xmm1
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX2-ONLY-NEXT:    vmovaps 96(%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm2[0,1],ymm0[0,1]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm1[0,1],ymm0[0,1]
 ; AVX2-ONLY-NEXT:    vmovaps 96(%r9), %xmm1
 ; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm2 = xmm1[0,0]
 ; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5,6,7]
@@ -8451,14 +8384,14 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
 ; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 128(%r8), %ymm2
+; AVX2-ONLY-NEXT:    vmovaps 128(%r8), %ymm1
+; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 128(%rsi), %xmm2
 ; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 128(%rsi), %xmm1
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX2-ONLY-NEXT:    vmovaps 128(%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm2[0,1],ymm0[0,1]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm1[0,1],ymm0[0,1]
 ; AVX2-ONLY-NEXT:    vmovaps 128(%r9), %xmm1
 ; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm2 = xmm1[0,0]
 ; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5,6,7]
@@ -8473,13 +8406,14 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
 ; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 160(%r8), %ymm15
-; AVX2-ONLY-NEXT:    vmovaps 160(%rsi), %xmm1
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, (%rsp) # 16-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 160(%r8), %ymm1
+; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 160(%rsi), %xmm2
+; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovaps 160(%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm15[0,1],ymm0[0,1]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm1[0,1],ymm0[0,1]
 ; AVX2-ONLY-NEXT:    vmovaps 160(%r9), %xmm1
 ; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm2 = xmm1[0,0]
 ; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5,6,7]
@@ -8494,13 +8428,14 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
 ; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 192(%r8), %ymm3
-; AVX2-ONLY-NEXT:    vmovaps 192(%rsi), %xmm1
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 192(%r8), %ymm1
+; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 192(%rsi), %xmm2
+; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovaps 192(%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm3[0,1],ymm0[0,1]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm1[0,1],ymm0[0,1]
 ; AVX2-ONLY-NEXT:    vmovaps 192(%r9), %xmm1
 ; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm2 = xmm1[0,0]
 ; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5,6,7]
@@ -8515,13 +8450,14 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
 ; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 224(%r8), %ymm5
-; AVX2-ONLY-NEXT:    vmovaps 224(%rsi), %xmm1
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 224(%r8), %ymm1
+; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 224(%rsi), %xmm2
+; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovaps 224(%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm5[0,1],ymm0[0,1]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm1[0,1],ymm0[0,1]
 ; AVX2-ONLY-NEXT:    vmovaps 224(%r9), %xmm1
 ; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm2 = xmm1[0,0]
 ; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5,6,7]
@@ -8536,755 +8472,681 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
 ; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 256(%r8), %ymm0
+; AVX2-ONLY-NEXT:    vmovaps 256(%r8), %ymm1
+; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovaps 256(%rsi), %xmm2
+; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 256(%rdi), %xmm0
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm1[0,1],ymm0[0,1]
+; AVX2-ONLY-NEXT:    vmovaps 256(%r9), %xmm1
+; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm2 = xmm1[0,0]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5,6,7]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 256(%rcx), %xmm2
 ; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 256(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm0[0,1],ymm1[0,1]
-; AVX2-ONLY-NEXT:    vmovaps 256(%r9), %xmm2
-; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm4 = xmm2[0,0]
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm4[2,3],ymm1[4,5,6,7]
+; AVX2-ONLY-NEXT:    vmovaps 256(%rdx), %xmm0
+; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1]
+; AVX2-ONLY-NEXT:    vbroadcastsd 264(%r8), %ymm2
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 288(%r8), %ymm1
 ; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 256(%rcx), %xmm4
-; AVX2-ONLY-NEXT:    vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 256(%rdx), %xmm1
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1]
-; AVX2-ONLY-NEXT:    vbroadcastsd 264(%r8), %ymm4
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7]
-; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
+; AVX2-ONLY-NEXT:    vmovaps 288(%rsi), %xmm2
+; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 288(%rdi), %xmm0
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm1[0,1],ymm0[0,1]
+; AVX2-ONLY-NEXT:    vmovaps 288(%r9), %xmm1
+; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm2 = xmm1[0,0]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5,6,7]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 288(%rcx), %xmm2
+; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 288(%rdx), %xmm0
+; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1]
+; AVX2-ONLY-NEXT:    vbroadcastsd 296(%r8), %ymm2
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 320(%r8), %ymm1
 ; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 288(%r8), %ymm2
-; AVX2-ONLY-NEXT:    vmovaps 288(%rsi), %xmm4
-; AVX2-ONLY-NEXT:    vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 288(%rdi), %xmm1
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm2[0,1],ymm1[0,1]
-; AVX2-ONLY-NEXT:    vmovaps 288(%r9), %xmm4
-; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm6 = xmm4[0,0]
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm6[2,3],ymm1[4,5,6,7]
+; AVX2-ONLY-NEXT:    vmovaps 320(%rsi), %xmm2
+; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 320(%rdi), %xmm0
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm1[0,1],ymm0[0,1]
+; AVX2-ONLY-NEXT:    vmovaps 320(%r9), %xmm1
+; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm2 = xmm1[0,0]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5,6,7]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 320(%rcx), %xmm2
+; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 320(%rdx), %xmm0
+; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1]
+; AVX2-ONLY-NEXT:    vbroadcastsd 328(%r8), %ymm2
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 352(%r8), %ymm1
 ; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 288(%rcx), %xmm6
-; AVX2-ONLY-NEXT:    vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 288(%rdx), %xmm1
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm6[1]
-; AVX2-ONLY-NEXT:    vbroadcastsd 296(%r8), %ymm6
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7]
-; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm4
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7]
+; AVX2-ONLY-NEXT:    vmovaps 352(%rsi), %xmm2
+; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 352(%rdi), %xmm0
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm1[0,1],ymm0[0,1]
+; AVX2-ONLY-NEXT:    vmovaps 352(%r9), %xmm1
+; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm2 = xmm1[0,0]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5,6,7]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 352(%rcx), %xmm2
+; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 352(%rdx), %xmm0
+; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1]
+; AVX2-ONLY-NEXT:    vbroadcastsd 360(%r8), %ymm2
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 384(%r8), %ymm1
 ; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 320(%r8), %ymm1
-; AVX2-ONLY-NEXT:    vmovaps 320(%rsi), %xmm6
-; AVX2-ONLY-NEXT:    vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 320(%rdi), %xmm4
-; AVX2-ONLY-NEXT:    vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm4 = xmm4[1],xmm6[1]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm1[0,1],ymm4[0,1]
-; AVX2-ONLY-NEXT:    vmovaps 320(%r9), %xmm6
-; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm7 = xmm6[0,0]
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm7[2,3],ymm4[4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 320(%rcx), %xmm7
-; AVX2-ONLY-NEXT:    vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 320(%rdx), %xmm4
-; AVX2-ONLY-NEXT:    vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm4 = xmm4[1],xmm7[1]
-; AVX2-ONLY-NEXT:    vbroadcastsd 328(%r8), %ymm7
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5,6,7]
-; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm6, %ymm0, %ymm6
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm6[6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 352(%r8), %ymm4
-; AVX2-ONLY-NEXT:    vmovaps 352(%rsi), %xmm7
-; AVX2-ONLY-NEXT:    vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 352(%rdi), %xmm6
-; AVX2-ONLY-NEXT:    vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm6 = xmm6[1],xmm7[1]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm6 = ymm4[0,1],ymm6[0,1]
-; AVX2-ONLY-NEXT:    vmovaps 352(%r9), %xmm7
-; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm8 = xmm7[0,0]
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm8[2,3],ymm6[4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 352(%rcx), %xmm8
-; AVX2-ONLY-NEXT:    vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 352(%rdx), %xmm6
-; AVX2-ONLY-NEXT:    vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm6 = xmm6[1],xmm8[1]
-; AVX2-ONLY-NEXT:    vbroadcastsd 360(%r8), %ymm8
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
-; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm7, %ymm0, %ymm7
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 384(%r8), %ymm6
-; AVX2-ONLY-NEXT:    vmovaps 384(%rsi), %xmm8
-; AVX2-ONLY-NEXT:    vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 384(%rdi), %xmm7
-; AVX2-ONLY-NEXT:    vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm7 = xmm7[1],xmm8[1]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm7 = ymm6[0,1],ymm7[0,1]
-; AVX2-ONLY-NEXT:    vmovaps 384(%r9), %xmm8
-; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm9 = xmm8[0,0]
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm9[2,3],ymm7[4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 384(%rcx), %xmm9
-; AVX2-ONLY-NEXT:    vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 384(%rdx), %xmm7
-; AVX2-ONLY-NEXT:    vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm7 = xmm7[1],xmm9[1]
-; AVX2-ONLY-NEXT:    vbroadcastsd 392(%r8), %ymm9
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm9[4,5,6,7]
-; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm8, %ymm0, %ymm8
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm8[6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 416(%r8), %ymm8
-; AVX2-ONLY-NEXT:    vmovaps 416(%rsi), %xmm9
-; AVX2-ONLY-NEXT:    vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 416(%rdi), %xmm7
-; AVX2-ONLY-NEXT:    vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm7 = xmm7[1],xmm9[1]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm7 = ymm8[0,1],ymm7[0,1]
-; AVX2-ONLY-NEXT:    vmovaps 416(%r9), %xmm9
-; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm10 = xmm9[0,0]
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm10[2,3],ymm7[4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 384(%rsi), %xmm2
+; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 384(%rdi), %xmm0
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm1[0,1],ymm0[0,1]
+; AVX2-ONLY-NEXT:    vmovaps 384(%r9), %xmm1
+; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm2 = xmm1[0,0]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5,6,7]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 384(%rcx), %xmm14
+; AVX2-ONLY-NEXT:    vmovaps 384(%rdx), %xmm0
+; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm14[1]
+; AVX2-ONLY-NEXT:    vbroadcastsd 392(%r8), %ymm2
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 416(%r8), %ymm1
+; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 416(%rsi), %xmm12
+; AVX2-ONLY-NEXT:    vmovaps 416(%rdi), %xmm13
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm13[1],xmm12[1]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm1[0,1],ymm0[0,1]
+; AVX2-ONLY-NEXT:    vmovaps 416(%r9), %xmm1
+; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm2 = xmm1[0,0]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5,6,7]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovaps 416(%rcx), %xmm10
-; AVX2-ONLY-NEXT:    vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 416(%rdx), %xmm7
-; AVX2-ONLY-NEXT:    vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm7 = xmm7[1],xmm10[1]
-; AVX2-ONLY-NEXT:    vbroadcastsd 424(%r8), %ymm10
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7]
-; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm9, %ymm0, %ymm9
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm9[6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 448(%r8), %ymm7
-; AVX2-ONLY-NEXT:    vmovaps 448(%rsi), %xmm10
-; AVX2-ONLY-NEXT:    vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 416(%rdx), %xmm11
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm11[1],xmm10[1]
+; AVX2-ONLY-NEXT:    vbroadcastsd 424(%r8), %ymm2
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 448(%r8), %ymm1
+; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 448(%rsi), %xmm8
 ; AVX2-ONLY-NEXT:    vmovaps 448(%rdi), %xmm9
-; AVX2-ONLY-NEXT:    vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm9 = xmm9[1],xmm10[1]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm9 = ymm7[0,1],ymm9[0,1]
-; AVX2-ONLY-NEXT:    vmovaps 448(%r9), %xmm10
-; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm11 = xmm10[0,0]
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm11[2,3],ymm9[4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 448(%rcx), %xmm11
-; AVX2-ONLY-NEXT:    vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 448(%rdx), %xmm9
-; AVX2-ONLY-NEXT:    vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm9 = xmm9[1],xmm11[1]
-; AVX2-ONLY-NEXT:    vbroadcastsd 456(%r8), %ymm11
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm11[4,5,6,7]
-; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm10, %ymm0, %ymm10
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm10[6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 480(%r8), %ymm9
-; AVX2-ONLY-NEXT:    vmovaps 480(%rsi), %xmm11
-; AVX2-ONLY-NEXT:    vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 480(%rdi), %xmm10
-; AVX2-ONLY-NEXT:    vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm10 = xmm10[1],xmm11[1]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm10 = ymm9[0,1],ymm10[0,1]
-; AVX2-ONLY-NEXT:    vmovaps 480(%r9), %xmm11
-; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm12 = xmm11[0,0]
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm12[2,3],ymm10[4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 480(%rcx), %xmm12
-; AVX2-ONLY-NEXT:    vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 480(%rdx), %xmm10
-; AVX2-ONLY-NEXT:    vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm10 = xmm10[1],xmm12[1]
-; AVX2-ONLY-NEXT:    vbroadcastsd 488(%r8), %ymm12
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm12[4,5,6,7]
-; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm11, %ymm0, %ymm11
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm11[6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps (%rdi), %ymm10
-; AVX2-ONLY-NEXT:    vmovaps (%rsi), %ymm11
-; AVX2-ONLY-NEXT:    vmovaps (%rdx), %ymm12
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm13 = ymm10[0],ymm11[0],ymm10[2],ymm11[2]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm13 = ymm13[2,3],ymm12[2,3]
-; AVX2-ONLY-NEXT:    vbroadcastsd 16(%rcx), %ymm14
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm14[6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm10 = ymm10[1],ymm11[1],ymm10[3],ymm11[3]
-; AVX2-ONLY-NEXT:    vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT:    # ymm10 = mem[2,3],ymm10[2,3]
-; AVX2-ONLY-NEXT:    vbroadcastsd 16(%r9), %ymm11
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm11[2,3],ymm10[4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm10 = ymm12[1],mem[1],ymm12[3],mem[3]
-; AVX2-ONLY-NEXT:    vbroadcastsd 24(%r8), %ymm11
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm10 = ymm10[2,3],ymm11[2,3]
-; AVX2-ONLY-NEXT:    vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 32(%rdi), %ymm10
-; AVX2-ONLY-NEXT:    vmovaps 32(%rsi), %ymm11
-; AVX2-ONLY-NEXT:    vmovaps 32(%rdx), %ymm12
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm13 = ymm10[0],ymm11[0],ymm10[2],ymm11[2]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm13 = ymm13[2,3],ymm12[2,3]
-; AVX2-ONLY-NEXT:    vbroadcastsd 48(%rcx), %ymm14
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm14[6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm10 = ymm10[1],ymm11[1],ymm10[3],ymm11[3]
-; AVX2-ONLY-NEXT:    vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT:    # ymm10 = mem[2,3],ymm10[2,3]
-; AVX2-ONLY-NEXT:    vbroadcastsd 48(%r9), %ymm11
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm11[2,3],ymm10[4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm10 = ymm12[1],mem[1],ymm12[3],mem[3]
-; AVX2-ONLY-NEXT:    vbroadcastsd 56(%r8), %ymm11
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm10 = ymm10[2,3],ymm11[2,3]
-; AVX2-ONLY-NEXT:    vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 64(%rdi), %ymm10
-; AVX2-ONLY-NEXT:    vmovaps 64(%rsi), %ymm11
-; AVX2-ONLY-NEXT:    vmovaps 64(%rdx), %ymm12
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm13 = ymm10[0],ymm11[0],ymm10[2],ymm11[2]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm13 = ymm13[2,3],ymm12[2,3]
-; AVX2-ONLY-NEXT:    vbroadcastsd 80(%rcx), %ymm14
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm14[6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm10 = ymm10[1],ymm11[1],ymm10[3],ymm11[3]
-; AVX2-ONLY-NEXT:    vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT:    # ymm10 = mem[2,3],ymm10[2,3]
-; AVX2-ONLY-NEXT:    vbroadcastsd 80(%r9), %ymm11
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm11[2,3],ymm10[4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm10 = ymm12[1],mem[1],ymm12[3],mem[3]
-; AVX2-ONLY-NEXT:    vbroadcastsd 88(%r8), %ymm11
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm10 = ymm10[2,3],ymm11[2,3]
-; AVX2-ONLY-NEXT:    vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 96(%rdi), %ymm10
-; AVX2-ONLY-NEXT:    vmovaps 96(%rsi), %ymm11
-; AVX2-ONLY-NEXT:    vmovaps 96(%rdx), %ymm12
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm13 = ymm10[0],ymm11[0],ymm10[2],ymm11[2]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm13 = ymm13[2,3],ymm12[2,3]
-; AVX2-ONLY-NEXT:    vbroadcastsd 112(%rcx), %ymm14
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm14[6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm10 = ymm10[1],ymm11[1],ymm10[3],ymm11[3]
-; AVX2-ONLY-NEXT:    vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT:    # ymm10 = mem[2,3],ymm10[2,3]
-; AVX2-ONLY-NEXT:    vbroadcastsd 112(%r9), %ymm11
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm11[2,3],ymm10[4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm10 = ymm12[1],mem[1],ymm12[3],mem[3]
-; AVX2-ONLY-NEXT:    vbroadcastsd 120(%r8), %ymm11
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm10 = ymm10[2,3],ymm11[2,3]
-; AVX2-ONLY-NEXT:    vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 128(%rdi), %ymm10
-; AVX2-ONLY-NEXT:    vmovaps 128(%rsi), %ymm11
-; AVX2-ONLY-NEXT:    vmovaps 128(%rdx), %ymm12
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm13 = ymm10[0],ymm11[0],ymm10[2],ymm11[2]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm13 = ymm13[2,3],ymm12[2,3]
-; AVX2-ONLY-NEXT:    vbroadcastsd 144(%rcx), %ymm14
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm14[6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm10 = ymm10[1],ymm11[1],ymm10[3],ymm11[3]
-; AVX2-ONLY-NEXT:    vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT:    # ymm10 = mem[2,3],ymm10[2,3]
-; AVX2-ONLY-NEXT:    vbroadcastsd 144(%r9), %ymm11
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm11[2,3],ymm10[4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm10 = ymm12[1],mem[1],ymm12[3],mem[3]
-; AVX2-ONLY-NEXT:    vbroadcastsd 152(%r8), %ymm11
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm10 = ymm10[2,3],ymm11[2,3]
-; AVX2-ONLY-NEXT:    vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 160(%rdi), %ymm10
-; AVX2-ONLY-NEXT:    vmovaps 160(%rsi), %ymm11
-; AVX2-ONLY-NEXT:    vmovaps 160(%rdx), %ymm12
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm13 = ymm10[0],ymm11[0],ymm10[2],ymm11[2]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm13 = ymm13[2,3],ymm12[2,3]
-; AVX2-ONLY-NEXT:    vbroadcastsd 176(%rcx), %ymm14
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm14[6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm10 = ymm10[1],ymm11[1],ymm10[3],ymm11[3]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm10 = ymm15[2,3],ymm10[2,3]
-; AVX2-ONLY-NEXT:    vbroadcastsd 176(%r9), %ymm11
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm11[2,3],ymm10[4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm10 = ymm12[1],mem[1],ymm12[3],mem[3]
-; AVX2-ONLY-NEXT:    vbroadcastsd 184(%r8), %ymm11
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm10 = ymm10[2,3],ymm11[2,3]
-; AVX2-ONLY-NEXT:    vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 192(%rdi), %ymm10
-; AVX2-ONLY-NEXT:    vmovaps 192(%rsi), %ymm11
-; AVX2-ONLY-NEXT:    vmovaps 192(%rdx), %ymm12
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm13 = ymm10[0],ymm11[0],ymm10[2],ymm11[2]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm13 = ymm13[2,3],ymm12[2,3]
-; AVX2-ONLY-NEXT:    vbroadcastsd 208(%rcx), %ymm14
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm14[6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm10 = ymm10[1],ymm11[1],ymm10[3],ymm11[3]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],ymm10[2,3]
-; AVX2-ONLY-NEXT:    vbroadcastsd 208(%r9), %ymm10
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm10[2,3],ymm3[4,5,6,7]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm9[1],xmm8[1]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm1[0,1],ymm0[0,1]
+; AVX2-ONLY-NEXT:    vmovaps 448(%r9), %xmm1
+; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm2 = xmm1[0,0]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5,6,7]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 448(%rcx), %xmm6
+; AVX2-ONLY-NEXT:    vmovaps 448(%rdx), %xmm7
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm7[1],xmm6[1]
+; AVX2-ONLY-NEXT:    vbroadcastsd 456(%r8), %ymm2
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 480(%r8), %ymm1
+; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 480(%rsi), %xmm4
+; AVX2-ONLY-NEXT:    vmovaps 480(%rdi), %xmm5
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm5[1],xmm4[1]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm1[0,1],ymm0[0,1]
+; AVX2-ONLY-NEXT:    vmovaps 480(%r9), %xmm0
+; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm2 = xmm0[0,0]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7]
+; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 480(%rcx), %xmm2
+; AVX2-ONLY-NEXT:    vmovaps 480(%rdx), %xmm3
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm1 = xmm3[1],xmm2[1]
+; AVX2-ONLY-NEXT:    vbroadcastsd 488(%r8), %ymm15
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm15[4,5,6,7]
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 16-byte Folded Reload
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 16-byte Folded Reload
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 16-byte Folded Reload
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 16-byte Folded Reload
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 16-byte Folded Reload
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 16-byte Folded Reload
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 16-byte Folded Reload
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 16-byte Folded Reload
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 16-byte Folded Reload
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 16-byte Folded Reload
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 16-byte Folded Reload
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 16-byte Folded Reload
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm14, %ymm0, %ymm0
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 16-byte Folded Reload
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm10, %ymm12, %ymm0
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm11, %ymm13, %ymm1
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm6, %ymm8, %ymm0
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm7, %ymm9, %ymm1
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm2, %ymm4, %ymm0
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm3, %ymm5, %ymm1
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps (%rdi), %ymm0
+; AVX2-ONLY-NEXT:    vmovaps (%rsi), %ymm1
+; AVX2-ONLY-NEXT:    vmovaps (%rdx), %ymm2
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],ymm2[2,3]
+; AVX2-ONLY-NEXT:    vbroadcastsd 16(%rcx), %ymm4
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm3 = ymm12[1],mem[1],ymm12[3],mem[3]
-; AVX2-ONLY-NEXT:    vbroadcastsd 216(%r8), %ymm10
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm14 = ymm3[2,3],ymm10[2,3]
-; AVX2-ONLY-NEXT:    vmovaps 224(%rdi), %ymm3
-; AVX2-ONLY-NEXT:    vmovaps 224(%rsi), %ymm10
-; AVX2-ONLY-NEXT:    vmovaps 224(%rdx), %ymm11
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm12 = ymm3[0],ymm10[0],ymm3[2],ymm10[2]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm12 = ymm12[2,3],ymm11[2,3]
-; AVX2-ONLY-NEXT:    vbroadcastsd 240(%rcx), %ymm13
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm13[6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm10[1],ymm3[3],ymm10[3]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm5[2,3],ymm3[2,3]
-; AVX2-ONLY-NEXT:    vbroadcastsd 240(%r9), %ymm5
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm5[2,3],ymm3[4,5,6,7]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; AVX2-ONLY-NEXT:    vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm0 = mem[2,3],ymm0[2,3]
+; AVX2-ONLY-NEXT:    vbroadcastsd 16(%r9), %ymm1
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm2[1],mem[1],ymm2[3],mem[3]
+; AVX2-ONLY-NEXT:    vbroadcastsd 24(%r8), %ymm1
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm12 = ymm0[2,3],ymm1[2,3]
+; AVX2-ONLY-NEXT:    vmovaps 32(%rdi), %ymm0
+; AVX2-ONLY-NEXT:    vmovaps 32(%rsi), %ymm1
+; AVX2-ONLY-NEXT:    vmovaps 32(%rdx), %ymm2
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],ymm2[2,3]
+; AVX2-ONLY-NEXT:    vbroadcastsd 48(%rcx), %ymm4
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm3 = ymm11[1],mem[1],ymm11[3],mem[3]
-; AVX2-ONLY-NEXT:    vbroadcastsd 248(%r8), %ymm5
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],ymm5[2,3]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; AVX2-ONLY-NEXT:    vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm0 = mem[2,3],ymm0[2,3]
+; AVX2-ONLY-NEXT:    vbroadcastsd 48(%r9), %ymm1
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm2[1],mem[1],ymm2[3],mem[3]
+; AVX2-ONLY-NEXT:    vbroadcastsd 56(%r8), %ymm1
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm13 = ymm0[2,3],ymm1[2,3]
+; AVX2-ONLY-NEXT:    vmovaps 64(%rdi), %ymm0
+; AVX2-ONLY-NEXT:    vmovaps 64(%rsi), %ymm1
+; AVX2-ONLY-NEXT:    vmovaps 64(%rdx), %ymm2
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],ymm2[2,3]
+; AVX2-ONLY-NEXT:    vbroadcastsd 80(%rcx), %ymm4
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 256(%rdi), %ymm3
-; AVX2-ONLY-NEXT:    vmovaps 256(%rsi), %ymm5
-; AVX2-ONLY-NEXT:    vmovaps 256(%rdx), %ymm10
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm11 = ymm3[0],ymm5[0],ymm3[2],ymm5[2]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm11 = ymm11[2,3],ymm10[2,3]
-; AVX2-ONLY-NEXT:    vbroadcastsd 272(%rcx), %ymm12
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm12[6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm5[1],ymm3[3],ymm5[3]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3]
-; AVX2-ONLY-NEXT:    vbroadcastsd 272(%r9), %ymm3
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5,6,7]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; AVX2-ONLY-NEXT:    vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm0 = mem[2,3],ymm0[2,3]
+; AVX2-ONLY-NEXT:    vbroadcastsd 80(%r9), %ymm1
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm10[1],mem[1],ymm10[3],mem[3]
-; AVX2-ONLY-NEXT:    vbroadcastsd 280(%r8), %ymm3
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm2[1],mem[1],ymm2[3],mem[3]
+; AVX2-ONLY-NEXT:    vbroadcastsd 88(%r8), %ymm1
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 288(%rdi), %ymm0
-; AVX2-ONLY-NEXT:    vmovaps 288(%rsi), %ymm3
-; AVX2-ONLY-NEXT:    vmovaps 288(%rdx), %ymm5
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm10 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm10 = ymm10[2,3],ymm5[2,3]
-; AVX2-ONLY-NEXT:    vbroadcastsd 304(%rcx), %ymm11
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm11[6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm3[1],ymm0[3],ymm3[3]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3]
-; AVX2-ONLY-NEXT:    vbroadcastsd 304(%r9), %ymm2
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5,6,7]
+; AVX2-ONLY-NEXT:    vmovaps 96(%rdi), %ymm0
+; AVX2-ONLY-NEXT:    vmovaps 96(%rsi), %ymm1
+; AVX2-ONLY-NEXT:    vmovaps 96(%rdx), %ymm2
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],ymm2[2,3]
+; AVX2-ONLY-NEXT:    vbroadcastsd 112(%rcx), %ymm4
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7]
+; AVX2-ONLY-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; AVX2-ONLY-NEXT:    vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm0 = mem[2,3],ymm0[2,3]
+; AVX2-ONLY-NEXT:    vbroadcastsd 112(%r9), %ymm1
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm5[1],mem[1],ymm5[3],mem[3]
-; AVX2-ONLY-NEXT:    vbroadcastsd 312(%r8), %ymm2
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm2[1],mem[1],ymm2[3],mem[3]
+; AVX2-ONLY-NEXT:    vbroadcastsd 120(%r8), %ymm1
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 320(%rdi), %ymm0
-; AVX2-ONLY-NEXT:    vmovaps 320(%rsi), %ymm2
-; AVX2-ONLY-NEXT:    vmovaps 320(%rdx), %ymm3
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm5 = ymm0[0],ymm2[0],ymm0[2],ymm2[2]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm5 = ymm5[2,3],ymm3[2,3]
-; AVX2-ONLY-NEXT:    vbroadcastsd 336(%rcx), %ymm10
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm10[6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
-; AVX2-ONLY-NEXT:    vbroadcastsd 336(%r9), %ymm1
+; AVX2-ONLY-NEXT:    vmovaps 128(%rdi), %ymm0
+; AVX2-ONLY-NEXT:    vmovaps 128(%rsi), %ymm1
+; AVX2-ONLY-NEXT:    vmovaps 128(%rdx), %ymm2
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],ymm2[2,3]
+; AVX2-ONLY-NEXT:    vbroadcastsd 144(%rcx), %ymm4
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7]
+; AVX2-ONLY-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; AVX2-ONLY-NEXT:    vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm0 = mem[2,3],ymm0[2,3]
+; AVX2-ONLY-NEXT:    vbroadcastsd 144(%r9), %ymm1
 ; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm3[1],mem[1],ymm3[3],mem[3]
-; AVX2-ONLY-NEXT:    vbroadcastsd 344(%r8), %ymm1
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm2[1],mem[1],ymm2[3],mem[3]
+; AVX2-ONLY-NEXT:    vbroadcastsd 152(%r8), %ymm1
 ; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 352(%rdi), %ymm0
-; AVX2-ONLY-NEXT:    vmovaps 352(%rsi), %ymm1
-; AVX2-ONLY-NEXT:    vmovaps 352(%rdx), %ymm2
+; AVX2-ONLY-NEXT:    vmovaps 160(%rdi), %ymm0
+; AVX2-ONLY-NEXT:    vmovaps 160(%rsi), %ymm1
+; AVX2-ONLY-NEXT:    vmovaps 160(%rdx), %ymm2
 ; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
 ; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],ymm2[2,3]
-; AVX2-ONLY-NEXT:    vbroadcastsd 368(%rcx), %ymm5
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7]
+; AVX2-ONLY-NEXT:    vbroadcastsd 176(%rcx), %ymm4
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm4[2,3],ymm0[2,3]
-; AVX2-ONLY-NEXT:    vbroadcastsd 368(%r9), %ymm1
+; AVX2-ONLY-NEXT:    vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm0 = mem[2,3],ymm0[2,3]
+; AVX2-ONLY-NEXT:    vbroadcastsd 176(%r9), %ymm1
 ; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm2[1],mem[1],ymm2[3],mem[3]
-; AVX2-ONLY-NEXT:    vbroadcastsd 376(%r8), %ymm1
+; AVX2-ONLY-NEXT:    vbroadcastsd 184(%r8), %ymm1
 ; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 384(%rdi), %ymm0
-; AVX2-ONLY-NEXT:    vmovaps 384(%rsi), %ymm1
-; AVX2-ONLY-NEXT:    vmovaps 384(%rdx), %ymm2
+; AVX2-ONLY-NEXT:    vmovaps 192(%rdi), %ymm0
+; AVX2-ONLY-NEXT:    vmovaps 192(%rsi), %ymm1
+; AVX2-ONLY-NEXT:    vmovaps 192(%rdx), %ymm2
 ; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
 ; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],ymm2[2,3]
-; AVX2-ONLY-NEXT:    vbroadcastsd 400(%rcx), %ymm4
+; AVX2-ONLY-NEXT:    vbroadcastsd 208(%rcx), %ymm4
 ; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm6[2,3],ymm0[2,3]
-; AVX2-ONLY-NEXT:    vbroadcastsd 400(%r9), %ymm1
+; AVX2-ONLY-NEXT:    vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm0 = mem[2,3],ymm0[2,3]
+; AVX2-ONLY-NEXT:    vbroadcastsd 208(%r9), %ymm1
 ; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm2[1],mem[1],ymm2[3],mem[3]
-; AVX2-ONLY-NEXT:    vbroadcastsd 408(%r8), %ymm1
+; AVX2-ONLY-NEXT:    vbroadcastsd 216(%r8), %ymm1
 ; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 416(%rdi), %ymm0
-; AVX2-ONLY-NEXT:    vmovaps 416(%rsi), %ymm1
-; AVX2-ONLY-NEXT:    vmovaps 416(%rdx), %ymm2
+; AVX2-ONLY-NEXT:    vmovaps 224(%rdi), %ymm0
+; AVX2-ONLY-NEXT:    vmovaps 224(%rsi), %ymm1
+; AVX2-ONLY-NEXT:    vmovaps 224(%rdx), %ymm2
 ; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
 ; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],ymm2[2,3]
-; AVX2-ONLY-NEXT:    vbroadcastsd 432(%rcx), %ymm4
+; AVX2-ONLY-NEXT:    vbroadcastsd 240(%rcx), %ymm4
 ; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm8[2,3],ymm0[2,3]
-; AVX2-ONLY-NEXT:    vbroadcastsd 432(%r9), %ymm1
+; AVX2-ONLY-NEXT:    vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm0 = mem[2,3],ymm0[2,3]
+; AVX2-ONLY-NEXT:    vbroadcastsd 240(%r9), %ymm1
 ; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm2[1],mem[1],ymm2[3],mem[3]
-; AVX2-ONLY-NEXT:    vbroadcastsd 440(%r8), %ymm1
+; AVX2-ONLY-NEXT:    vbroadcastsd 248(%r8), %ymm1
 ; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 448(%rdi), %ymm0
-; AVX2-ONLY-NEXT:    vmovaps 448(%rsi), %ymm1
-; AVX2-ONLY-NEXT:    vmovaps 448(%rdx), %ymm2
+; AVX2-ONLY-NEXT:    vmovaps 256(%rdi), %ymm0
+; AVX2-ONLY-NEXT:    vmovaps 256(%rsi), %ymm1
+; AVX2-ONLY-NEXT:    vmovaps 256(%rdx), %ymm2
 ; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
 ; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],ymm2[2,3]
-; AVX2-ONLY-NEXT:    vbroadcastsd 464(%rcx), %ymm4
+; AVX2-ONLY-NEXT:    vbroadcastsd 272(%rcx), %ymm4
 ; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm7[2,3],ymm0[2,3]
-; AVX2-ONLY-NEXT:    vbroadcastsd 464(%r9), %ymm1
+; AVX2-ONLY-NEXT:    vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm0 = mem[2,3],ymm0[2,3]
+; AVX2-ONLY-NEXT:    vbroadcastsd 272(%r9), %ymm1
 ; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm2[1],mem[1],ymm2[3],mem[3]
-; AVX2-ONLY-NEXT:    vbroadcastsd 472(%r8), %ymm1
+; AVX2-ONLY-NEXT:    vbroadcastsd 280(%r8), %ymm1
 ; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 480(%rdi), %ymm0
-; AVX2-ONLY-NEXT:    vmovaps 480(%rsi), %ymm1
-; AVX2-ONLY-NEXT:    vmovaps 480(%rdx), %ymm2
+; AVX2-ONLY-NEXT:    vmovaps 288(%rdi), %ymm0
+; AVX2-ONLY-NEXT:    vmovaps 288(%rsi), %ymm1
+; AVX2-ONLY-NEXT:    vmovaps 288(%rdx), %ymm2
 ; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
 ; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],ymm2[2,3]
-; AVX2-ONLY-NEXT:    vbroadcastsd 496(%rcx), %ymm4
+; AVX2-ONLY-NEXT:    vbroadcastsd 304(%rcx), %ymm4
 ; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm9[2,3],ymm0[2,3]
-; AVX2-ONLY-NEXT:    vbroadcastsd 496(%r9), %ymm1
+; AVX2-ONLY-NEXT:    vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm0 = mem[2,3],ymm0[2,3]
+; AVX2-ONLY-NEXT:    vbroadcastsd 304(%r9), %ymm1
 ; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm2[1],mem[1],ymm2[3],mem[3]
-; AVX2-ONLY-NEXT:    vbroadcastsd 504(%r8), %ymm1
+; AVX2-ONLY-NEXT:    vbroadcastsd 312(%r8), %ymm1
 ; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX2-ONLY-NEXT:    # xmm0 = xmm0[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX2-ONLY-NEXT:    # xmm0 = xmm0[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX2-ONLY-NEXT:    # xmm0 = xmm0[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX2-ONLY-NEXT:    # xmm0 = xmm0[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX2-ONLY-NEXT:    # xmm0 = xmm0[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX2-ONLY-NEXT:    # xmm0 = xmm0[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX2-ONLY-NEXT:    # xmm0 = xmm0[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX2-ONLY-NEXT:    # xmm0 = xmm0[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX2-ONLY-NEXT:    # xmm0 = xmm0[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX2-ONLY-NEXT:    # xmm0 = xmm0[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX2-ONLY-NEXT:    # xmm0 = xmm0[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX2-ONLY-NEXT:    # xmm0 = xmm0[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX2-ONLY-NEXT:    # xmm0 = xmm0[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX2-ONLY-NEXT:    # xmm0 = xmm0[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX2-ONLY-NEXT:    # xmm0 = xmm0[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX2-ONLY-NEXT:    # xmm0 = xmm0[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX2-ONLY-NEXT:    # xmm0 = xmm0[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX2-ONLY-NEXT:    # xmm0 = xmm0[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm15 # 16-byte Folded Reload
-; AVX2-ONLY-NEXT:    # xmm15 = xmm0[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm13 # 16-byte Folded Reload
-; AVX2-ONLY-NEXT:    # xmm13 = xmm0[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm12 # 16-byte Folded Reload
-; AVX2-ONLY-NEXT:    # xmm12 = xmm0[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm10 # 16-byte Folded Reload
-; AVX2-ONLY-NEXT:    # xmm10 = xmm0[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm9 # 16-byte Folded Reload
-; AVX2-ONLY-NEXT:    # xmm9 = xmm0[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm8 # 16-byte Folded Reload
-; AVX2-ONLY-NEXT:    # xmm8 = xmm0[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm7 # 16-byte Folded Reload
-; AVX2-ONLY-NEXT:    # xmm7 = xmm0[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload
-; AVX2-ONLY-NEXT:    # xmm6 = xmm0[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload
-; AVX2-ONLY-NEXT:    # xmm5 = xmm0[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload
-; AVX2-ONLY-NEXT:    # xmm4 = xmm0[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
-; AVX2-ONLY-NEXT:    # xmm3 = xmm0[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
-; AVX2-ONLY-NEXT:    # xmm2 = xmm0[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
-; AVX2-ONLY-NEXT:    # xmm1 = xmm0[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX2-ONLY-NEXT:    # xmm0 = xmm0[0],mem[0]
+; AVX2-ONLY-NEXT:    vmovaps 320(%rdi), %ymm0
+; AVX2-ONLY-NEXT:    vmovaps 320(%rsi), %ymm1
+; AVX2-ONLY-NEXT:    vmovaps 320(%rdx), %ymm2
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],ymm2[2,3]
+; AVX2-ONLY-NEXT:    vbroadcastsd 336(%rcx), %ymm4
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7]
+; AVX2-ONLY-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; AVX2-ONLY-NEXT:    vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm0 = mem[2,3],ymm0[2,3]
+; AVX2-ONLY-NEXT:    vbroadcastsd 336(%r9), %ymm1
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm2[1],mem[1],ymm2[3],mem[3]
+; AVX2-ONLY-NEXT:    vbroadcastsd 344(%r8), %ymm1
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 352(%rdi), %ymm0
+; AVX2-ONLY-NEXT:    vmovaps 352(%rsi), %ymm1
+; AVX2-ONLY-NEXT:    vmovaps 352(%rdx), %ymm2
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],ymm2[2,3]
+; AVX2-ONLY-NEXT:    vbroadcastsd 368(%rcx), %ymm4
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7]
+; AVX2-ONLY-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; AVX2-ONLY-NEXT:    vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm0 = mem[2,3],ymm0[2,3]
+; AVX2-ONLY-NEXT:    vbroadcastsd 368(%r9), %ymm1
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm2[1],mem[1],ymm2[3],mem[3]
+; AVX2-ONLY-NEXT:    vbroadcastsd 376(%r8), %ymm1
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 384(%rdi), %ymm0
+; AVX2-ONLY-NEXT:    vmovaps 384(%rsi), %ymm1
+; AVX2-ONLY-NEXT:    vmovaps 384(%rdx), %ymm2
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],ymm2[2,3]
+; AVX2-ONLY-NEXT:    vbroadcastsd 400(%rcx), %ymm4
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7]
+; AVX2-ONLY-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; AVX2-ONLY-NEXT:    vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm0 = mem[2,3],ymm0[2,3]
+; AVX2-ONLY-NEXT:    vbroadcastsd 400(%r9), %ymm1
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm2[1],mem[1],ymm2[3],mem[3]
+; AVX2-ONLY-NEXT:    vbroadcastsd 408(%r8), %ymm1
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm15 = ymm0[2,3],ymm1[2,3]
+; AVX2-ONLY-NEXT:    vmovaps 416(%rdi), %ymm0
+; AVX2-ONLY-NEXT:    vmovaps 416(%rsi), %ymm1
+; AVX2-ONLY-NEXT:    vmovaps 416(%rdx), %ymm4
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],ymm4[2,3]
+; AVX2-ONLY-NEXT:    vbroadcastsd 432(%rcx), %ymm5
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm5[6,7]
+; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; AVX2-ONLY-NEXT:    vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm0 = mem[2,3],ymm0[2,3]
+; AVX2-ONLY-NEXT:    vbroadcastsd 432(%r9), %ymm1
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm14 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm4[1],mem[1],ymm4[3],mem[3]
+; AVX2-ONLY-NEXT:    vbroadcastsd 440(%r8), %ymm1
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm11 = ymm0[2,3],ymm1[2,3]
+; AVX2-ONLY-NEXT:    vmovaps 448(%rdi), %ymm1
+; AVX2-ONLY-NEXT:    vmovaps 448(%rsi), %ymm2
+; AVX2-ONLY-NEXT:    vmovaps 448(%rdx), %ymm4
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm7 = ymm1[0],ymm2[0],ymm1[2],ymm2[2]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm7 = ymm7[2,3],ymm4[2,3]
+; AVX2-ONLY-NEXT:    vbroadcastsd 464(%rcx), %ymm8
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm10 = ymm7[0,1,2,3,4,5],ymm8[6,7]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3]
+; AVX2-ONLY-NEXT:    vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm1 = mem[2,3],ymm1[2,3]
+; AVX2-ONLY-NEXT:    vbroadcastsd 464(%r9), %ymm2
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm9 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm1 = ymm4[1],mem[1],ymm4[3],mem[3]
+; AVX2-ONLY-NEXT:    vbroadcastsd 472(%r8), %ymm2
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm6 = ymm1[2,3],ymm2[2,3]
+; AVX2-ONLY-NEXT:    vmovaps 480(%rdi), %ymm1
+; AVX2-ONLY-NEXT:    vmovaps 480(%rsi), %ymm2
+; AVX2-ONLY-NEXT:    vmovaps 480(%rdx), %ymm4
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm7 = ymm1[0],ymm2[0],ymm1[2],ymm2[2]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm7 = ymm7[2,3],ymm4[2,3]
+; AVX2-ONLY-NEXT:    vbroadcastsd 496(%rcx), %ymm8
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5],ymm8[6,7]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3]
+; AVX2-ONLY-NEXT:    vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm1 = mem[2,3],ymm1[2,3]
+; AVX2-ONLY-NEXT:    vbroadcastsd 496(%r9), %ymm2
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm3 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm1 = ymm4[1],mem[1],ymm4[3],mem[3]
+; AVX2-ONLY-NEXT:    vbroadcastsd 504(%r8), %ymm2
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm2[2,3]
 ; AVX2-ONLY-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],mem[6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],mem[6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],mem[6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],mem[6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],mem[6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],mem[6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm11 = ymm14[0,1,2,3,4,5],mem[6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],mem[6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],mem[6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],mem[6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],mem[6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],mem[6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],mem[6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],mem[6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm14 = ymm11[0,1,2,3,4,5],mem[6,7]
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5],mem[6,7]
+; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],mem[6,7]
+; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7]
+; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7]
+; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7]
+; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm13 = ymm1[0,1,2,3,4,5],mem[6,7]
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm12 = ymm1[0,1,2,3,4,5],mem[6,7]
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm8 = ymm1[0,1,2,3,4,5],mem[6,7]
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm7 = ymm1[0,1,2,3,4,5],mem[6,7]
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3,4,5],mem[6,7]
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5],mem[6,7]
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3,4,5],mem[6,7]
 ; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],mem[6,7]
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, 16(%rax)
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, (%rax)
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, 2320(%rax)
-; AVX2-ONLY-NEXT:    vmovaps %xmm3, 2304(%rax)
-; AVX2-ONLY-NEXT:    vmovaps %xmm4, 2704(%rax)
-; AVX2-ONLY-NEXT:    vmovaps %xmm5, 2688(%rax)
-; AVX2-ONLY-NEXT:    vmovaps %xmm6, 2896(%rax)
-; AVX2-ONLY-NEXT:    vmovaps %xmm7, 2880(%rax)
-; AVX2-ONLY-NEXT:    vmovaps %xmm8, 2512(%rax)
-; AVX2-ONLY-NEXT:    vmovaps %xmm9, 2496(%rax)
-; AVX2-ONLY-NEXT:    vmovaps %xmm10, 1936(%rax)
-; AVX2-ONLY-NEXT:    vmovaps %xmm12, 1920(%rax)
-; AVX2-ONLY-NEXT:    vmovaps %xmm13, 2128(%rax)
-; AVX2-ONLY-NEXT:    vmovaps %xmm15, 2112(%rax)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, 1744(%rax)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, 1728(%rax)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, 1168(%rax)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, 1152(%rax)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, 1360(%rax)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, 1344(%rax)
-; AVX2-ONLY-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, 976(%rax)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, 960(%rax)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, 592(%rax)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, 576(%rax)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, 208(%rax)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, 192(%rax)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, 400(%rax)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, 384(%rax)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, 784(%rax)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, 768(%rax)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, 1552(%rax)
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, 1536(%rax)
-; AVX2-ONLY-NEXT:    vmovaps %ymm11, 3040(%rax)
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 3008(%rax)
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 2976(%rax)
-; AVX2-ONLY-NEXT:    vmovaps %ymm14, 2848(%rax)
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 2816(%rax)
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 2784(%rax)
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 2656(%rax)
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 2624(%rax)
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],mem[6,7]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 3040(%rax)
+; AVX2-ONLY-NEXT:    vmovaps %ymm3, 3008(%rax)
+; AVX2-ONLY-NEXT:    vmovaps %ymm5, 2976(%rax)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 2880(%rax)
+; AVX2-ONLY-NEXT:    vmovaps %ymm6, 2848(%rax)
+; AVX2-ONLY-NEXT:    vmovaps %ymm9, 2816(%rax)
+; AVX2-ONLY-NEXT:    vmovaps %ymm10, 2784(%rax)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 2688(%rax)
+; AVX2-ONLY-NEXT:    vmovaps %ymm11, 2656(%rax)
+; AVX2-ONLY-NEXT:    vmovaps %ymm14, 2624(%rax)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, 2592(%rax)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 2464(%rax)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 2496(%rax)
+; AVX2-ONLY-NEXT:    vmovaps %ymm15, 2464(%rax)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, 2432(%rax)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, 2400(%rax)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 2272(%rax)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 2304(%rax)
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, 2272(%rax)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, 2240(%rax)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, 2208(%rax)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 2080(%rax)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 2112(%rax)
+; AVX2-ONLY-NEXT:    vmovaps %ymm2, 2080(%rax)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, 2048(%rax)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, 2016(%rax)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1888(%rax)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1920(%rax)
+; AVX2-ONLY-NEXT:    vmovaps %ymm4, 1888(%rax)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1856(%rax)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1824(%rax)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1696(%rax)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1728(%rax)
+; AVX2-ONLY-NEXT:    vmovaps %ymm7, 1696(%rax)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1664(%rax)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1632(%rax)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1504(%rax)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1536(%rax)
+; AVX2-ONLY-NEXT:    vmovaps %ymm8, 1504(%rax)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1472(%rax)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1440(%rax)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1312(%rax)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1344(%rax)
+; AVX2-ONLY-NEXT:    vmovaps %ymm12, 1312(%rax)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1280(%rax)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1248(%rax)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1120(%rax)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1152(%rax)
+; AVX2-ONLY-NEXT:    vmovaps %ymm13, 1120(%rax)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1088(%rax)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1056(%rax)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 960(%rax)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, 928(%rax)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, 896(%rax)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, 864(%rax)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 768(%rax)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, 736(%rax)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, 704(%rax)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, 672(%rax)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 576(%rax)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, 544(%rax)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, 512(%rax)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, 480(%rax)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 384(%rax)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, 352(%rax)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, 320(%rax)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, 288(%rax)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 192(%rax)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, 160(%rax)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, 128(%rax)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, 96(%rax)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, (%rax)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, 2944(%rax)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, 2912(%rax)
@@ -9348,7 +9210,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, 64(%rax)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, 32(%rax)
-; AVX2-ONLY-NEXT:    addq $3656, %rsp # imm = 0xE48
+; AVX2-ONLY-NEXT:    addq $2968, %rsp # imm = 0xB98
 ; AVX2-ONLY-NEXT:    vzeroupper
 ; AVX2-ONLY-NEXT:    retq
 ;

diff  --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-7.ll
index 4ea0da9b7e774c..6cd85102818512 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-7.ll
@@ -248,16 +248,16 @@ define void @store_i64_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-ONLY:       # %bb.0:
 ; AVX2-ONLY-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX2-ONLY-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX2-ONLY-NEXT:    vmovaps (%rdi), %ymm1
-; AVX2-ONLY-NEXT:    vmovaps (%rsi), %ymm2
-; AVX2-ONLY-NEXT:    vmovaps (%rdx), %ymm3
-; AVX2-ONLY-NEXT:    vmovaps (%rcx), %ymm4
-; AVX2-ONLY-NEXT:    vmovaps (%r8), %ymm5
-; AVX2-ONLY-NEXT:    vmovaps (%r10), %xmm6
+; AVX2-ONLY-NEXT:    vmovaps (%rdi), %ymm4
+; AVX2-ONLY-NEXT:    vmovaps (%rsi), %ymm5
+; AVX2-ONLY-NEXT:    vmovaps (%rdx), %ymm6
+; AVX2-ONLY-NEXT:    vmovaps (%rcx), %ymm1
+; AVX2-ONLY-NEXT:    vmovaps (%r8), %ymm2
+; AVX2-ONLY-NEXT:    vmovaps (%r10), %xmm3
 ; AVX2-ONLY-NEXT:    vmovaps 16(%r10), %xmm0
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm7 = ymm1[1],ymm2[1],ymm1[3],ymm2[3]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm7 = ymm4[1],ymm5[1],ymm4[3],ymm5[3]
 ; AVX2-ONLY-NEXT:    vpermpd {{.*#+}} ymm7 = ymm7[0,2,3,3]
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm3[6,7]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm6[6,7]
 ; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm7[2,3,4,5,6,7]
 ; AVX2-ONLY-NEXT:    vmovaps (%r8), %xmm7
 ; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm7, %ymm0, %ymm8
@@ -267,31 +267,31 @@ define void @store_i64_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-ONLY-NEXT:    vbroadcastsd 8(%rcx), %ymm12
 ; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7]
 ; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm8 = ymm11[0,1,2,3,4,5],ymm8[6,7]
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2]
-; AVX2-ONLY-NEXT:    vmovaps (%r9), %xmm2
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm11 = xmm2[1],xmm6[1]
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm11[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm11 = ymm5[0],mem[0],ymm5[2],mem[2]
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],ymm11[2,3]
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm7[0],xmm2[0]
-; AVX2-ONLY-NEXT:    vmovaps (%rdi), %xmm7
-; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm7, %ymm2, %ymm2
-; AVX2-ONLY-NEXT:    vbroadcastsd %xmm6, %ymm6
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5],ymm2[6,7]
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3]
-; AVX2-ONLY-NEXT:    vbroadcastsd 24(%r9), %ymm5
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],ymm5[2,3]
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],mem[6,7]
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm5 = xmm7[0],xmm10[0]
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm6 = xmm9[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm6, 16(%rax)
-; AVX2-ONLY-NEXT:    vmovaps %xmm5, (%rax)
-; AVX2-ONLY-NEXT:    vmovaps %ymm3, 128(%rax)
-; AVX2-ONLY-NEXT:    vmovaps %ymm4, 192(%rax)
-; AVX2-ONLY-NEXT:    vmovaps %ymm1, 96(%rax)
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm4 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
+; AVX2-ONLY-NEXT:    vmovaps (%r9), %xmm5
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm11 = xmm5[1],xmm3[1]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm4 = ymm11[0,1,2,3],ymm4[4,5,6,7]
+; AVX2-ONLY-NEXT:    vinsertf128 $1, (%rcx), %ymm10, %ymm10
+; AVX2-ONLY-NEXT:    vmovaps (%rdi), %xmm11
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm9, %ymm11, %ymm9
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm9 = ymm9[0],ymm10[0],ymm9[2],ymm10[2]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm10 = ymm2[0],mem[0],ymm2[2],mem[2]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm6 = ymm6[0],ymm1[0],ymm6[2],ymm1[2]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm6 = ymm6[2,3],ymm10[2,3]
+; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm5 = xmm7[0],xmm5[0]
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm11, %ymm5, %ymm5
+; AVX2-ONLY-NEXT:    vbroadcastsd %xmm3, %ymm3
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5],ymm5[6,7]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3]
+; AVX2-ONLY-NEXT:    vbroadcastsd 24(%r9), %ymm2
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7]
+; AVX2-ONLY-NEXT:    vmovaps %ymm6, 128(%rax)
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, 192(%rax)
+; AVX2-ONLY-NEXT:    vmovaps %ymm9, (%rax)
+; AVX2-ONLY-NEXT:    vmovaps %ymm4, 96(%rax)
 ; AVX2-ONLY-NEXT:    vmovaps %ymm8, 64(%rax)
-; AVX2-ONLY-NEXT:    vmovaps %ymm2, 32(%rax)
+; AVX2-ONLY-NEXT:    vmovaps %ymm3, 32(%rax)
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, 160(%rax)
 ; AVX2-ONLY-NEXT:    vzeroupper
 ; AVX2-ONLY-NEXT:    retq
@@ -644,101 +644,105 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-ONLY-LABEL: store_i64_stride7_vf8:
 ; AVX2-ONLY:       # %bb.0:
 ; AVX2-ONLY-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-ONLY-NEXT:    vmovaps 32(%rdi), %ymm10
-; AVX2-ONLY-NEXT:    vmovaps (%rdi), %ymm6
-; AVX2-ONLY-NEXT:    vmovaps (%rsi), %ymm8
-; AVX2-ONLY-NEXT:    vmovaps 32(%rsi), %ymm11
-; AVX2-ONLY-NEXT:    vmovaps 32(%rdx), %ymm12
-; AVX2-ONLY-NEXT:    vmovaps (%rdx), %ymm7
-; AVX2-ONLY-NEXT:    vmovaps (%r8), %ymm1
-; AVX2-ONLY-NEXT:    vmovaps 32(%r8), %ymm5
-; AVX2-ONLY-NEXT:    vmovaps (%r9), %ymm2
-; AVX2-ONLY-NEXT:    vmovaps (%rax), %xmm4
+; AVX2-ONLY-NEXT:    vmovaps 32(%rdi), %ymm8
+; AVX2-ONLY-NEXT:    vmovaps (%rdi), %ymm4
+; AVX2-ONLY-NEXT:    vmovaps (%rsi), %ymm7
+; AVX2-ONLY-NEXT:    vmovaps 32(%rsi), %ymm9
+; AVX2-ONLY-NEXT:    vmovaps (%rdx), %ymm5
+; AVX2-ONLY-NEXT:    vmovaps 32(%rdx), %ymm10
+; AVX2-ONLY-NEXT:    vmovaps 32(%r8), %ymm3
+; AVX2-ONLY-NEXT:    vmovaps (%rax), %xmm2
 ; AVX2-ONLY-NEXT:    vmovaps 16(%rax), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps 32(%rax), %xmm13
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm3 = ymm6[1],ymm8[1],ymm6[3],ymm8[3]
-; AVX2-ONLY-NEXT:    vpermpd {{.*#+}} ymm3 = ymm3[0,2,3,3]
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm7[6,7]
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3,4,5,6,7]
+; AVX2-ONLY-NEXT:    vmovaps 32(%rax), %xmm12
+; AVX2-ONLY-NEXT:    vmovaps 48(%rax), %xmm13
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm1 = ymm4[1],ymm7[1],ymm4[3],ymm7[3]
+; AVX2-ONLY-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm5[6,7]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovaps 32(%r8), %xmm15
-; AVX2-ONLY-NEXT:    vmovaps 32(%rdx), %xmm9
-; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm3 = mem[0,0]
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm9[1]
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm15, %ymm0, %ymm1
+; AVX2-ONLY-NEXT:    vmovaps 32(%rdx), %xmm6
+; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm11 = mem[0,0]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm11 = xmm11[1],xmm6[1]
 ; AVX2-ONLY-NEXT:    vbroadcastsd 40(%rcx), %ymm14
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm14[4,5,6,7]
-; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm15, %ymm0, %ymm14
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm14[6,7]
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm5 = ymm5[0],mem[0],ymm5[2],mem[2]
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm14 = ymm12[0],mem[0],ymm12[2],mem[2]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm5 = ymm14[2,3],ymm5[2,3]
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm6 = ymm6[0],ymm8[0],ymm6[2],ymm8[2]
-; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm8 = mem[0,0]
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm8 = xmm8[1],xmm4[1]
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm14[4,5,6,7]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5],ymm1[6,7]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm3 = ymm3[0],mem[0],ymm3[2],mem[2]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm11 = ymm10[0],mem[0],ymm10[2],mem[2]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm11[2,3],ymm3[2,3]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm4 = ymm4[0],ymm7[0],ymm4[2],ymm7[2]
+; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm7 = mem[0,0]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm7 = xmm7[1],xmm2[1]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm4 = ymm7[0,1,2,3],ymm4[4,5,6,7]
+; AVX2-ONLY-NEXT:    vmovaps (%rdi), %xmm11
 ; AVX2-ONLY-NEXT:    vmovaps 32(%rdi), %xmm14
-; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm14, %ymm0, %ymm8
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm14, %ymm0, %ymm7
 ; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm15 = xmm15[0],mem[0]
-; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm13, %ymm15, %ymm15
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm8 = ymm15[0,1,2,3,4,5],ymm8[6,7]
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm15 = ymm10[1],ymm11[1],ymm10[3],ymm11[3]
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm12, %ymm15, %ymm15
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm7 = ymm15[0,1,2,3,4,5],ymm7[6,7]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm15 = ymm8[1],ymm9[1],ymm8[3],ymm9[3]
 ; AVX2-ONLY-NEXT:    vpermpd {{.*#+}} ymm15 = ymm15[0,2,3,3]
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm12 = ymm15[0,1,2,3,4,5],ymm12[6,7]
-; AVX2-ONLY-NEXT:    vmovaps 48(%rax), %xmm15
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm12 = ymm15[0,1],ymm12[2,3,4,5,6,7]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm10 = ymm15[0,1,2,3,4,5],ymm10[6,7]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm10 = ymm13[0,1],ymm10[2,3,4,5,6,7]
+; AVX2-ONLY-NEXT:    vmovaps (%rsi), %xmm13
+; AVX2-ONLY-NEXT:    vinsertf128 $1, (%rcx), %ymm13, %ymm13
+; AVX2-ONLY-NEXT:    vinsertf128 $1, (%rdx), %ymm11, %ymm15
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm13 = ymm15[0],ymm13[0],ymm15[2],ymm13[2]
 ; AVX2-ONLY-NEXT:    vmovaps (%r8), %xmm15
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm10 = ymm10[0],ymm11[0],ymm10[2],ymm11[2]
-; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm11 = mem[0,0]
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm11 = xmm11[1],xmm13[1]
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm11 = mem[0,0]
-; AVX2-ONLY-NEXT:    vbroadcastsd 8(%rcx), %ymm13
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm13 = ymm7[0,1,2,3],ymm13[4,5,6,7]
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm13[2,3,4,5,6,7]
-; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm15, %ymm0, %ymm13
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm13[6,7]
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm13 = ymm1[0],ymm2[0],ymm1[2],ymm2[2]
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm7 = ymm7[0],mem[0],ymm7[2],mem[2]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm7 = ymm7[2,3],ymm13[2,3]
-; AVX2-ONLY-NEXT:    vbroadcastsd 56(%rcx), %ymm13
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} xmm13 = xmm13[0,1],mem[2,3]
-; AVX2-ONLY-NEXT:    vbroadcastsd 56(%r9), %ymm0
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm13 = xmm14[0],mem[0]
-; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm9, %ymm13, %ymm9
-; AVX2-ONLY-NEXT:    vbroadcastsd 32(%rcx), %ymm13
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm13[6,7]
-; AVX2-ONLY-NEXT:    vmovaps (%rdi), %xmm13
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm14 = xmm15[0],mem[0]
-; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm13, %ymm14, %ymm14
-; AVX2-ONLY-NEXT:    vbroadcastsd %xmm4, %ymm4
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm4 = ymm14[0,1,2,3],ymm4[4,5],ymm14[6,7]
-; AVX2-ONLY-NEXT:    movq {{[0-9]+}}(%rsp), %rdi
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3]
-; AVX2-ONLY-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3]
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7]
-; AVX2-ONLY-NEXT:    vbroadcastsd 24(%rcx), %ymm2
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovaps (%rdx), %xmm2
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm8 = ymm8[0],ymm9[0],ymm8[2],ymm9[2]
+; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm9 = mem[0,0]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm9 = xmm9[1],xmm12[1]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
+; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm9 = mem[0,0]
+; AVX2-ONLY-NEXT:    vbroadcastsd 8(%rcx), %ymm12
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm12 = ymm5[0,1,2,3],ymm12[4,5,6,7]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm12[2,3,4,5,6,7]
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm15, %ymm0, %ymm12
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm12[6,7]
+; AVX2-ONLY-NEXT:    vmovaps (%r8), %ymm12
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm5 = ymm5[0],mem[0],ymm5[2],mem[2]
+; AVX2-ONLY-NEXT:    vmovaps (%r9), %ymm0
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm12[0],ymm0[0],ymm12[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm5[2,3],ymm1[2,3]
+; AVX2-ONLY-NEXT:    vbroadcastsd 56(%rcx), %ymm5
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} xmm5 = xmm5[0,1],mem[2,3]
+; AVX2-ONLY-NEXT:    vbroadcastsd 56(%r9), %ymm3
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm5 = xmm14[0],mem[0]
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm6, %ymm5, %ymm5
+; AVX2-ONLY-NEXT:    vbroadcastsd 32(%rcx), %ymm6
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm6[6,7]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm6 = xmm15[0],mem[0]
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm11, %ymm6, %ymm6
+; AVX2-ONLY-NEXT:    vbroadcastsd %xmm2, %ymm2
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5],ymm6[6,7]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm12[1],ymm0[1],ymm12[3],ymm0[3]
+; AVX2-ONLY-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3]
 ; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm13 = xmm13[0],mem[0]
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm2 = xmm2[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, 16(%rdi)
-; AVX2-ONLY-NEXT:    vmovaps %xmm13, (%rdi)
-; AVX2-ONLY-NEXT:    vmovaps %ymm7, 128(%rdi)
-; AVX2-ONLY-NEXT:    vmovaps %ymm11, 64(%rdi)
-; AVX2-ONLY-NEXT:    vmovaps %ymm10, 320(%rdi)
-; AVX2-ONLY-NEXT:    vmovaps %ymm1, 192(%rdi)
-; AVX2-ONLY-NEXT:    vmovaps %ymm12, 384(%rdi)
-; AVX2-ONLY-NEXT:    vmovaps %ymm8, 256(%rdi)
-; AVX2-ONLY-NEXT:    vmovaps %ymm4, 32(%rdi)
-; AVX2-ONLY-NEXT:    vmovaps %ymm6, 96(%rdi)
-; AVX2-ONLY-NEXT:    vmovaps %ymm5, 352(%rdi)
-; AVX2-ONLY-NEXT:    vmovaps %ymm9, 224(%rdi)
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 416(%rdi)
-; AVX2-ONLY-NEXT:    vmovaps %ymm3, 288(%rdi)
+; AVX2-ONLY-NEXT:    vbroadcastsd 24(%rcx), %ymm6
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm0[2,3,4,5,6,7]
+; AVX2-ONLY-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7]
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, 128(%rcx)
+; AVX2-ONLY-NEXT:    vmovaps %ymm9, 64(%rcx)
+; AVX2-ONLY-NEXT:    vmovaps %ymm8, 320(%rcx)
+; AVX2-ONLY-NEXT:    vmovaps %ymm13, (%rcx)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 192(%rcx)
+; AVX2-ONLY-NEXT:    vmovaps %ymm10, 384(%rcx)
+; AVX2-ONLY-NEXT:    vmovaps %ymm7, 256(%rcx)
+; AVX2-ONLY-NEXT:    vmovaps %ymm2, 32(%rcx)
+; AVX2-ONLY-NEXT:    vmovaps %ymm4, 96(%rcx)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 352(%rcx)
+; AVX2-ONLY-NEXT:    vmovaps %ymm5, 224(%rcx)
+; AVX2-ONLY-NEXT:    vmovaps %ymm3, 416(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 160(%rdi)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 288(%rcx)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 160(%rcx)
 ; AVX2-ONLY-NEXT:    vzeroupper
 ; AVX2-ONLY-NEXT:    retq
 ;
@@ -2265,233 +2269,239 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ;
 ; AVX2-ONLY-LABEL: store_i64_stride7_vf16:
 ; AVX2-ONLY:       # %bb.0:
-; AVX2-ONLY-NEXT:    subq $424, %rsp # imm = 0x1A8
+; AVX2-ONLY-NEXT:    subq $520, %rsp # imm = 0x208
 ; AVX2-ONLY-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-ONLY-NEXT:    vmovaps 64(%rdi), %ymm5
+; AVX2-ONLY-NEXT:    vmovaps 64(%rdi), %ymm7
+; AVX2-ONLY-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 32(%rdi), %ymm12
+; AVX2-ONLY-NEXT:    vmovaps (%rdi), %ymm2
+; AVX2-ONLY-NEXT:    vmovaps (%rsi), %ymm4
+; AVX2-ONLY-NEXT:    vmovaps 32(%rsi), %ymm10
+; AVX2-ONLY-NEXT:    vmovaps 64(%rsi), %ymm15
+; AVX2-ONLY-NEXT:    vmovaps (%rdx), %ymm1
+; AVX2-ONLY-NEXT:    vmovaps 32(%rdx), %ymm13
+; AVX2-ONLY-NEXT:    vmovaps 16(%rax), %xmm3
+; AVX2-ONLY-NEXT:    vmovaps 32(%rax), %xmm0
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm5 = ymm2[1],ymm4[1],ymm2[3],ymm4[3]
+; AVX2-ONLY-NEXT:    vpermpd {{.*#+}} ymm5 = ymm5[0,2,3,3]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm1[6,7]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm5[2,3,4,5,6,7]
+; AVX2-ONLY-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 32(%rdi), %xmm3
+; AVX2-ONLY-NEXT:    vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm3
+; AVX2-ONLY-NEXT:    vmovaps 32(%r8), %xmm5
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm6 = xmm5[0],mem[0]
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm0, %ymm6, %ymm6
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5],ymm3[6,7]
+; AVX2-ONLY-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 32(%rdx), %xmm6
+; AVX2-ONLY-NEXT:    vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm3 = mem[0,0]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm6[1]
+; AVX2-ONLY-NEXT:    vbroadcastsd 40(%rcx), %ymm6
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7]
+; AVX2-ONLY-NEXT:    vmovaps 48(%rax), %xmm6
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm5, %ymm0, %ymm5
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7]
+; AVX2-ONLY-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm3 = ymm12[1],ymm10[1],ymm12[3],ymm10[3]
+; AVX2-ONLY-NEXT:    vpermpd {{.*#+}} ymm3 = ymm3[0,2,3,3]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm13[6,7]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm3 = ymm6[0,1],ymm3[2,3,4,5,6,7]
+; AVX2-ONLY-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm3 = ymm7[1],ymm15[1],ymm7[3],ymm15[3]
+; AVX2-ONLY-NEXT:    vpermpd {{.*#+}} ymm5 = ymm3[0,2,3,3]
+; AVX2-ONLY-NEXT:    vmovaps 64(%rdx), %ymm3
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm3[6,7]
+; AVX2-ONLY-NEXT:    vmovaps 80(%rax), %xmm6
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3,4,5,6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 32(%rdi), %ymm1
-; AVX2-ONLY-NEXT:    vmovaps (%rdi), %ymm3
-; AVX2-ONLY-NEXT:    vmovaps (%rsi), %ymm15
-; AVX2-ONLY-NEXT:    vmovaps 32(%rsi), %ymm8
-; AVX2-ONLY-NEXT:    vmovaps 64(%rsi), %ymm13
-; AVX2-ONLY-NEXT:    vmovaps 32(%rdx), %ymm11
-; AVX2-ONLY-NEXT:    vmovaps (%rdx), %ymm9
-; AVX2-ONLY-NEXT:    vmovaps 16(%rax), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps 32(%rax), %xmm7
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm2 = ymm3[1],ymm15[1],ymm3[3],ymm15[3]
-; AVX2-ONLY-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,2,3,3]
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm9[6,7]
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 32(%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-ONLY-NEXT:    vmovaps 32(%r8), %xmm2
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm4 = xmm2[0],mem[0]
-; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm7, %ymm4, %ymm4
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 32(%rdx), %xmm4
-; AVX2-ONLY-NEXT:    vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm4[1]
-; AVX2-ONLY-NEXT:    vbroadcastsd 40(%rcx), %ymm4
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovaps 48(%rax), %xmm4
-; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm8[1],ymm1[3],ymm8[3]
-; AVX2-ONLY-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3]
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm11[6,7]
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3,4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm5[1],ymm13[1],ymm5[3],ymm13[3]
-; AVX2-ONLY-NEXT:    vpermpd {{.*#+}} ymm2 = ymm0[0,2,3,3]
-; AVX2-ONLY-NEXT:    vmovaps 64(%rdx), %ymm0
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm0[6,7]
-; AVX2-ONLY-NEXT:    vmovaps 80(%rax), %xmm4
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3,4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 96(%r8), %xmm2
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm4 = xmm2[0],mem[0]
-; AVX2-ONLY-NEXT:    vinsertf128 $1, 96(%rax), %ymm4, %ymm4
-; AVX2-ONLY-NEXT:    vmovaps 96(%rdi), %xmm5
-; AVX2-ONLY-NEXT:    vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 96(%r8), %xmm5
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm6 = xmm5[0],mem[0]
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 96(%rax), %ymm6, %ymm6
+; AVX2-ONLY-NEXT:    vmovaps 96(%rdi), %xmm7
+; AVX2-ONLY-NEXT:    vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm7, %ymm0, %ymm7
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6,7]
+; AVX2-ONLY-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 96(%rdx), %xmm7
+; AVX2-ONLY-NEXT:    vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm6 = mem[0,0]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm6 = xmm6[1],xmm7[1]
+; AVX2-ONLY-NEXT:    vbroadcastsd 104(%rcx), %ymm7
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7]
 ; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm5, %ymm0, %ymm5
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 96(%rdx), %xmm5
-; AVX2-ONLY-NEXT:    vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm4 = mem[0,0]
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm4 = xmm4[1],xmm5[1]
-; AVX2-ONLY-NEXT:    vbroadcastsd 104(%rcx), %ymm5
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7]
-; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 96(%rdi), %ymm6
-; AVX2-ONLY-NEXT:    vmovaps 96(%rsi), %ymm10
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm2 = ymm6[1],ymm10[1],ymm6[3],ymm10[3]
-; AVX2-ONLY-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,2,3,3]
-; AVX2-ONLY-NEXT:    vmovaps 96(%rdx), %ymm14
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm14[6,7]
-; AVX2-ONLY-NEXT:    vmovaps 112(%rax), %xmm4
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3,4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vbroadcastsd 8(%rcx), %ymm2
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm2 = ymm9[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7]
+; AVX2-ONLY-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 96(%rdi), %ymm11
+; AVX2-ONLY-NEXT:    vmovaps 96(%rsi), %ymm9
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm5 = ymm11[1],ymm9[1],ymm11[3],ymm9[3]
+; AVX2-ONLY-NEXT:    vpermpd {{.*#+}} ymm5 = ymm5[0,2,3,3]
+; AVX2-ONLY-NEXT:    vmovaps 96(%rdx), %ymm6
+; AVX2-ONLY-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm6[6,7]
+; AVX2-ONLY-NEXT:    vmovaps 112(%rax), %xmm6
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3,4,5,6,7]
+; AVX2-ONLY-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps (%rsi), %xmm5
+; AVX2-ONLY-NEXT:    vinsertf128 $1, (%rcx), %ymm5, %ymm5
+; AVX2-ONLY-NEXT:    vmovaps (%rdi), %xmm14
+; AVX2-ONLY-NEXT:    vinsertf128 $1, (%rdx), %ymm14, %ymm6
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm5 = ymm6[0],ymm5[0],ymm6[2],ymm5[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vbroadcastsd 8(%rcx), %ymm5
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm5[4,5,6,7]
+; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm6 = mem[0,0]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3,4,5,6,7]
+; AVX2-ONLY-NEXT:    vmovaps (%r8), %xmm7
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm7, %ymm0, %ymm6
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm6[6,7]
+; AVX2-ONLY-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm4[0],ymm2[2],ymm4[2]
+; AVX2-ONLY-NEXT:    vmovaps (%rax), %xmm8
 ; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm4 = mem[0,0]
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3,4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovaps (%r8), %xmm12
-; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm12, %ymm0, %ymm5
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm5[6,7]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm4 = xmm4[1],xmm8[1]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm3[0],ymm15[0],ymm3[2],ymm15[2]
-; AVX2-ONLY-NEXT:    vmovaps (%rax), %xmm5
-; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm3 = mem[0,0]
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm5[1]
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm9 = ymm9[0],mem[0],ymm9[2],mem[2]
-; AVX2-ONLY-NEXT:    vmovaps (%r8), %ymm4
-; AVX2-ONLY-NEXT:    vmovaps (%r9), %ymm3
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm15 = ymm4[0],ymm3[0],ymm4[2],ymm3[2]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm9[2,3],ymm15[2,3]
-; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm8[0],ymm1[2],ymm8[2]
-; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm8 = mem[0,0]
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm7 = xmm8[1],xmm7[1]
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm11[0],mem[0],ymm11[2],mem[2]
-; AVX2-ONLY-NEXT:    vmovaps 32(%r8), %ymm8
-; AVX2-ONLY-NEXT:    vmovaps 32(%r9), %ymm7
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm9 = ymm8[0],ymm7[0],ymm8[2],ymm7[2]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm9[2,3]
-; AVX2-ONLY-NEXT:    vmovups %ymm1, (%rsp) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vbroadcastsd 72(%rcx), %ymm1
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm9 = mem[0,0]
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm9[0,1],ymm1[2,3,4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovaps 64(%r8), %xmm11
-; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm11, %ymm0, %ymm9
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm9[6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm13[0],ymm1[2],ymm13[2]
-; AVX2-ONLY-NEXT:    vmovaps 64(%rax), %xmm9
-; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm15 = mem[0,0]
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm15 = xmm15[1],xmm9[1]
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm15[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[2],mem[2]
+; AVX2-ONLY-NEXT:    vmovaps (%r8), %ymm5
+; AVX2-ONLY-NEXT:    vmovaps (%r9), %ymm4
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm5[0],ymm4[0],ymm5[2],ymm4[2]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3]
 ; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm15 = ymm0[0],mem[0],ymm0[2],mem[2]
-; AVX2-ONLY-NEXT:    vmovaps 64(%r8), %ymm2
-; AVX2-ONLY-NEXT:    vmovaps 64(%r9), %ymm1
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm13 = ymm2[0],ymm1[0],ymm2[2],ymm1[2]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm15 = ymm15[2,3],ymm13[2,3]
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm6 = ymm6[0],ymm10[0],ymm6[2],ymm10[2]
-; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm10 = mem[0,0]
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7]
-; AVX2-ONLY-NEXT:    vbroadcastsd 120(%rcx), %ymm10
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} xmm10 = xmm10[0,1],mem[2,3]
-; AVX2-ONLY-NEXT:    vbroadcastsd 120(%r9), %ymm13
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm13 = ymm10[0,1,2,3],ymm13[4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovaps 96(%rax), %ymm0
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm10 = ymm6[0,1],ymm0[2,3],ymm6[4,5,6,7]
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm0[6,7]
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm12[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovaps (%rdi), %xmm6
-; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm6, %ymm0, %ymm0
-; AVX2-ONLY-NEXT:    vbroadcastsd %xmm5, %ymm5
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm12 = ymm0[0,1,2,3],ymm5[4,5],ymm0[6,7]
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm4[1],ymm3[1],ymm4[3],ymm3[3]
-; AVX2-ONLY-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3]
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
-; AVX2-ONLY-NEXT:    vbroadcastsd 24(%rcx), %ymm3
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm0[2,3,4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
-; AVX2-ONLY-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
-; AVX2-ONLY-NEXT:    vbroadcastsd 32(%rcx), %ymm4
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7]
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm4 = ymm8[1],ymm7[1],ymm8[3],ymm7[3]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm12[0],ymm10[0],ymm12[2],ymm10[2]
+; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm2 = mem[0,0]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm2[1],xmm0[1]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm13[0],mem[0],ymm13[2],mem[2]
+; AVX2-ONLY-NEXT:    vmovaps 32(%r8), %ymm6
+; AVX2-ONLY-NEXT:    vmovaps 32(%r9), %ymm12
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm6[0],ymm12[0],ymm6[2],ymm12[2]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 64(%rsi), %xmm0
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 64(%rcx), %ymm0, %ymm0
+; AVX2-ONLY-NEXT:    vmovaps 64(%rdi), %xmm10
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 64(%rdx), %ymm10, %ymm1
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, (%rsp) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vbroadcastsd 72(%rcx), %ymm0
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm1 = mem[0,0]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
+; AVX2-ONLY-NEXT:    vmovaps 64(%r8), %xmm2
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm1
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm15[0],ymm0[2],ymm15[2]
+; AVX2-ONLY-NEXT:    vmovaps 64(%rax), %xmm1
+; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm13 = mem[0,0]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm13 = xmm13[1],xmm1[1]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm13 = ymm3[0],mem[0],ymm3[2],mem[2]
+; AVX2-ONLY-NEXT:    vmovaps 64(%r8), %ymm3
+; AVX2-ONLY-NEXT:    vmovaps 64(%r9), %ymm0
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm15 = ymm3[0],ymm0[0],ymm3[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm13 = ymm13[2,3],ymm15[2,3]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm9 = ymm11[0],ymm9[0],ymm11[2],ymm9[2]
+; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm11 = mem[0,0]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7]
+; AVX2-ONLY-NEXT:    vbroadcastsd 120(%rcx), %ymm11
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} xmm11 = xmm11[0,1],mem[2,3]
+; AVX2-ONLY-NEXT:    vbroadcastsd 120(%r9), %ymm15
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm15[4,5,6,7]
+; AVX2-ONLY-NEXT:    vmovaps 96(%rax), %ymm15
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm15[2,3],ymm9[4,5,6,7]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm15[6,7]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm7 = xmm7[0],mem[0]
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm14, %ymm7, %ymm7
+; AVX2-ONLY-NEXT:    vbroadcastsd %xmm8, %ymm8
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5],ymm7[6,7]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm4 = ymm5[1],ymm4[1],ymm5[3],ymm4[3]
 ; AVX2-ONLY-NEXT:    vpermpd {{.*#+}} ymm4 = ymm4[0,2,3,3]
 ; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],mem[6,7]
-; AVX2-ONLY-NEXT:    vbroadcastsd 56(%rcx), %ymm5
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3,4,5,6,7]
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm5 = xmm11[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovaps 64(%rdi), %xmm8
-; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm8, %ymm5, %ymm5
-; AVX2-ONLY-NEXT:    vbroadcastsd %xmm9, %ymm7
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm7 = ymm5[0,1,2,3],ymm7[4,5],ymm5[6,7]
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3]
-; AVX2-ONLY-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3]
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7]
+; AVX2-ONLY-NEXT:    vbroadcastsd 24(%rcx), %ymm5
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm4[2,3,4,5,6,7]
+; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm4 = xmm4[0],mem[0]
+; AVX2-ONLY-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 16-byte Folded Reload
+; AVX2-ONLY-NEXT:    vbroadcastsd 32(%rcx), %ymm8
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm8 = ymm4[0,1,2,3,4,5],ymm8[6,7]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm4 = ymm6[1],ymm12[1],ymm6[3],ymm12[3]
+; AVX2-ONLY-NEXT:    vpermpd {{.*#+}} ymm4 = ymm4[0,2,3,3]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],mem[6,7]
+; AVX2-ONLY-NEXT:    vbroadcastsd 56(%rcx), %ymm6
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3,4,5,6,7]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm2 = xmm2[0],mem[0]
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm10, %ymm2, %ymm2
+; AVX2-ONLY-NEXT:    vbroadcastsd %xmm1, %ymm1
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5],ymm2[6,7]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm3[1],ymm0[1],ymm3[3],ymm0[3]
+; AVX2-ONLY-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
 ; AVX2-ONLY-NEXT:    vbroadcastsd 88(%rcx), %ymm2
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7]
 ; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
 ; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm2 = xmm2[0],mem[0]
 ; AVX2-ONLY-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 16-byte Folded Reload
-; AVX2-ONLY-NEXT:    vbroadcastsd 96(%rcx), %ymm5
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm5[6,7]
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm5 = ymm14[0],mem[0],ymm14[2],mem[2]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm5 = ymm5[2,3],mem[2,3]
-; AVX2-ONLY-NEXT:    vbroadcastsd 112(%r9), %ymm9
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm9[6,7]
+; AVX2-ONLY-NEXT:    vbroadcastsd 96(%rcx), %ymm3
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm3 = ymm3[0],mem[0],ymm3[2],mem[2]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],mem[2,3]
+; AVX2-ONLY-NEXT:    vbroadcastsd 112(%r9), %ymm6
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm6[6,7]
 ; AVX2-ONLY-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm8 = xmm8[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovaps 64(%rdx), %xmm9
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm9 = xmm9[0],mem[0]
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm6 = xmm6[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovaps (%rdx), %xmm11
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm11 = xmm11[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm11, 16(%rax)
-; AVX2-ONLY-NEXT:    vmovaps %xmm6, (%rax)
-; AVX2-ONLY-NEXT:    vmovaps %xmm9, 464(%rax)
-; AVX2-ONLY-NEXT:    vmovaps %xmm8, 448(%rax)
-; AVX2-ONLY-NEXT:    vmovaps %ymm5, 800(%rax)
-; AVX2-ONLY-NEXT:    vmovaps %ymm10, 768(%rax)
-; AVX2-ONLY-NEXT:    vmovaps %ymm15, 576(%rax)
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm5, 544(%rax)
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm5, 512(%rax)
-; AVX2-ONLY-NEXT:    vmovups (%rsp), %ymm5 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm5, 352(%rax)
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm5, 320(%rax)
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm5, 128(%rax)
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm5, 96(%rax)
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm5, 64(%rax)
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm5, 832(%rax)
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm5, 736(%rax)
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm5, 704(%rax)
+; AVX2-ONLY-NEXT:    vmovaps %ymm3, 800(%rax)
+; AVX2-ONLY-NEXT:    vmovaps %ymm9, 768(%rax)
+; AVX2-ONLY-NEXT:    vmovaps %ymm13, 576(%rax)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm3, 544(%rax)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm3, 512(%rax)
+; AVX2-ONLY-NEXT:    vmovups (%rsp), %ymm3 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm3, 448(%rax)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm3, 352(%rax)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm3, 320(%rax)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm3, 128(%rax)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm3, 96(%rax)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm3, 64(%rax)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm3, (%rax)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm3, 832(%rax)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm3, 736(%rax)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm3, 704(%rax)
 ; AVX2-ONLY-NEXT:    vmovaps %ymm2, 672(%rax)
-; AVX2-ONLY-NEXT:    vmovaps %ymm1, 640(%rax)
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm1, 608(%rax)
-; AVX2-ONLY-NEXT:    vmovaps %ymm7, 480(%rax)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 640(%rax)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 608(%rax)
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, 480(%rax)
 ; AVX2-ONLY-NEXT:    vmovaps %ymm4, 416(%rax)
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm1, 384(%rax)
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm1, 288(%rax)
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm1, 256(%rax)
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 224(%rax)
-; AVX2-ONLY-NEXT:    vmovaps %ymm3, 192(%rax)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 384(%rax)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 288(%rax)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 256(%rax)
+; AVX2-ONLY-NEXT:    vmovaps %ymm8, 224(%rax)
+; AVX2-ONLY-NEXT:    vmovaps %ymm5, 192(%rax)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, 160(%rax)
-; AVX2-ONLY-NEXT:    vmovaps %ymm12, 32(%rax)
-; AVX2-ONLY-NEXT:    vmovaps %ymm13, 864(%rax)
-; AVX2-ONLY-NEXT:    addq $424, %rsp # imm = 0x1A8
+; AVX2-ONLY-NEXT:    vmovaps %ymm7, 32(%rax)
+; AVX2-ONLY-NEXT:    vmovaps %ymm11, 864(%rax)
+; AVX2-ONLY-NEXT:    addq $520, %rsp # imm = 0x208
 ; AVX2-ONLY-NEXT:    vzeroupper
 ; AVX2-ONLY-NEXT:    retq
 ;
@@ -5253,80 +5263,95 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ;
 ; AVX2-ONLY-LABEL: store_i64_stride7_vf32:
 ; AVX2-ONLY:       # %bb.0:
-; AVX2-ONLY-NEXT:    subq $1464, %rsp # imm = 0x5B8
+; AVX2-ONLY-NEXT:    subq $1672, %rsp # imm = 0x688
 ; AVX2-ONLY-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-ONLY-NEXT:    vmovaps 32(%rdi), %ymm0
-; AVX2-ONLY-NEXT:    vmovaps (%rdi), %ymm1
-; AVX2-ONLY-NEXT:    vmovaps (%rsi), %ymm2
-; AVX2-ONLY-NEXT:    vmovaps (%rdx), %ymm3
-; AVX2-ONLY-NEXT:    vmovaps (%r8), %ymm13
-; AVX2-ONLY-NEXT:    vmovaps (%r9), %ymm9
-; AVX2-ONLY-NEXT:    vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm4 = mem[0,0]
-; AVX2-ONLY-NEXT:    vbroadcastsd 8(%rcx), %ymm5
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm5 = ymm3[0,1,2,3],ymm5[4,5,6,7]
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm5 = ymm4[0,1],ymm5[2,3,4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovaps (%r8), %xmm6
-; AVX2-ONLY-NEXT:    vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps (%rdi), %ymm0
+; AVX2-ONLY-NEXT:    vmovaps (%rsi), %ymm1
+; AVX2-ONLY-NEXT:    vmovaps (%rdx), %ymm2
+; AVX2-ONLY-NEXT:    vmovaps (%r8), %ymm7
+; AVX2-ONLY-NEXT:    vmovups %ymm7, (%rsp) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps (%r9), %ymm8
+; AVX2-ONLY-NEXT:    vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps (%rsi), %xmm3
+; AVX2-ONLY-NEXT:    vinsertf128 $1, (%rcx), %ymm3, %ymm3
+; AVX2-ONLY-NEXT:    vmovaps (%rdi), %xmm4
+; AVX2-ONLY-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 32(%rdi), %xmm9
+; AVX2-ONLY-NEXT:    vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-ONLY-NEXT:    vinsertf128 $1, (%rdx), %ymm4, %ymm4
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[2],ymm3[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm3 = mem[0,0]
+; AVX2-ONLY-NEXT:    vbroadcastsd 8(%rcx), %ymm4
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm4 = ymm2[0,1,2,3],ymm4[4,5,6,7]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3,4,5,6,7]
+; AVX2-ONLY-NEXT:    vmovaps (%r8), %xmm5
+; AVX2-ONLY-NEXT:    vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX2-ONLY-NEXT:    vmovaps 32(%r8), %xmm4
-; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm6, %ymm0, %ymm6
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm6[6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm6 = ymm1[0],ymm2[0],ymm1[2],ymm2[2]
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm5, %ymm0, %ymm5
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7]
+; AVX2-ONLY-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
 ; AVX2-ONLY-NEXT:    vmovaps (%rax), %xmm10
 ; AVX2-ONLY-NEXT:    vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 16(%rax), %xmm7
-; AVX2-ONLY-NEXT:    vmovaps 32(%rax), %xmm5
-; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm8 = mem[0,0]
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm8 = xmm8[1],xmm10[1]
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm6 = ymm13[0],ymm9[0],ymm13[2],ymm9[2]
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm8 = ymm3[0],mem[0],ymm3[2],mem[2]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm6 = ymm8[2,3],ymm6[2,3]
-; AVX2-ONLY-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3]
-; AVX2-ONLY-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3]
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7]
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm7[0,1],ymm1[2,3,4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 32(%rdi), %xmm1
+; AVX2-ONLY-NEXT:    vmovaps 16(%rax), %xmm5
+; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm6 = mem[0,0]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm6 = xmm6[1],xmm10[1]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7]
+; AVX2-ONLY-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm3 = ymm7[0],ymm8[0],ymm7[2],ymm8[2]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm6 = ymm2[0],mem[0],ymm2[2],mem[2]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm6[2,3],ymm3[2,3]
+; AVX2-ONLY-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 32(%rax), %xmm3
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; AVX2-ONLY-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3,4,5,6,7]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm9, %ymm0, %ymm0
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm4[0],mem[0]
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 32(%rdx), %xmm1
 ; AVX2-ONLY-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm2 = xmm4[0],mem[0]
-; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm5, %ymm2, %ymm2
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 32(%rdx), %xmm2
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm1 = mem[0,0]
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
-; AVX2-ONLY-NEXT:    vbroadcastsd 40(%rcx), %ymm2
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; AVX2-ONLY-NEXT:    vbroadcastsd 40(%rcx), %ymm1
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm1
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1]
+; AVX2-ONLY-NEXT:    vmovaps 32(%rdi), %ymm1
 ; AVX2-ONLY-NEXT:    vmovaps 32(%rsi), %ymm2
-; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm3
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm1 = mem[0,0]
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm5[1]
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm2[0],ymm0[2],ymm2[2]
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 32(%rdx), %ymm1
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm3 = ymm1[0],ymm2[0],ymm1[2],ymm2[2]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 32(%rdx), %ymm0
 ; AVX2-ONLY-NEXT:    vmovaps 32(%r8), %ymm3
 ; AVX2-ONLY-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovaps 32(%r9), %ymm4
 ; AVX2-ONLY-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2]
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm4 = ymm1[0],mem[0],ymm1[2],mem[2]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm4 = ymm0[0],mem[0],ymm0[2],mem[2]
 ; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3]
 ; AVX2-ONLY-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3]
-; AVX2-ONLY-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3]
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3]
+; AVX2-ONLY-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
 ; AVX2-ONLY-NEXT:    vmovaps 48(%rax), %xmm1
 ; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 64(%rsi), %xmm0
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 64(%rcx), %ymm0, %ymm0
+; AVX2-ONLY-NEXT:    vmovaps 64(%rdi), %xmm1
+; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 64(%rdx), %ymm1, %ymm1
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovaps 64(%rdx), %ymm0
 ; AVX2-ONLY-NEXT:    vbroadcastsd 72(%rcx), %ymm1
 ; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
@@ -5400,19 +5425,26 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-ONLY-NEXT:    vmovaps 112(%rax), %xmm1
 ; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 128(%rsi), %xmm0
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 128(%rcx), %ymm0, %ymm0
+; AVX2-ONLY-NEXT:    vmovaps 128(%rdi), %xmm1
+; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 128(%rdx), %ymm1, %ymm1
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovaps 128(%rdx), %ymm0
 ; AVX2-ONLY-NEXT:    vbroadcastsd 136(%rcx), %ymm1
 ; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
 ; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm2 = mem[0,0]
 ; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovaps 128(%r8), %xmm14
-; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm14, %ymm0, %ymm2
+; AVX2-ONLY-NEXT:    vmovaps 128(%r8), %xmm2
+; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
 ; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 128(%rax), %xmm2
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 128(%rax), %xmm13
 ; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm1 = mem[0,0]
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm13[1]
 ; AVX2-ONLY-NEXT:    vmovaps 128(%rdi), %ymm2
 ; AVX2-ONLY-NEXT:    vmovaps 128(%rsi), %ymm3
 ; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
@@ -5434,13 +5466,13 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0]
 ; AVX2-ONLY-NEXT:    vmovaps 160(%rax), %xmm2
 ; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
-; AVX2-ONLY-NEXT:    vmovaps 160(%rdi), %xmm10
-; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm10, %ymm0, %ymm3
+; AVX2-ONLY-NEXT:    vmovaps 160(%rdi), %xmm9
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm9, %ymm0, %ymm3
 ; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 160(%rdx), %xmm9
+; AVX2-ONLY-NEXT:    vmovaps 160(%rdx), %xmm8
 ; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm1 = mem[0,0]
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm9[1]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm8[1]
 ; AVX2-ONLY-NEXT:    vbroadcastsd 168(%rcx), %ymm3
 ; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
 ; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
@@ -5454,9 +5486,9 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovaps 160(%rdx), %ymm0
-; AVX2-ONLY-NEXT:    vmovaps 160(%r8), %ymm8
-; AVX2-ONLY-NEXT:    vmovaps 160(%r9), %ymm7
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm3 = ymm8[0],ymm7[0],ymm8[2],ymm7[2]
+; AVX2-ONLY-NEXT:    vmovaps 160(%r8), %ymm7
+; AVX2-ONLY-NEXT:    vmovaps 160(%r9), %ymm6
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm3 = ymm7[0],ymm6[0],ymm7[2],ymm6[2]
 ; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm4 = ymm0[0],mem[0],ymm0[2],mem[2]
 ; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3]
 ; AVX2-ONLY-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -5466,285 +5498,269 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-ONLY-NEXT:    vmovaps 176(%rax), %xmm1
 ; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 192(%rsi), %xmm0
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 192(%rcx), %ymm0, %ymm0
+; AVX2-ONLY-NEXT:    vmovaps 192(%rdi), %xmm5
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 192(%rdx), %ymm5, %ymm1
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovaps 192(%rdx), %ymm0
 ; AVX2-ONLY-NEXT:    vbroadcastsd 200(%rcx), %ymm1
 ; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
 ; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm2 = mem[0,0]
 ; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovaps 192(%r8), %xmm4
-; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm2
+; AVX2-ONLY-NEXT:    vmovaps 192(%r8), %xmm3
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm2
 ; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 192(%rax), %xmm5
+; AVX2-ONLY-NEXT:    vmovaps 192(%rax), %xmm2
 ; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm1 = mem[0,0]
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm5[1]
-; AVX2-ONLY-NEXT:    vmovaps 192(%rdi), %ymm2
-; AVX2-ONLY-NEXT:    vmovaps 192(%rsi), %ymm3
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm6 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
+; AVX2-ONLY-NEXT:    vmovaps 192(%rdi), %ymm4
+; AVX2-ONLY-NEXT:    vmovaps 192(%rsi), %ymm10
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm15 = ymm4[0],ymm10[0],ymm4[2],ymm10[2]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm15[4,5,6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovaps 192(%r8), %ymm1
 ; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[2],mem[2]
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm6 = ymm0[0],mem[0],ymm0[2],mem[2]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm6[2,3],ymm1[2,3]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm15 = ymm0[0],mem[0],ymm0[2],mem[2]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm15[2,3],ymm1[2,3]
 ; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm1 = ymm4[1],ymm10[1],ymm4[3],ymm10[3]
 ; AVX2-ONLY-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3]
 ; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
 ; AVX2-ONLY-NEXT:    vmovaps 208(%rax), %xmm1
 ; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovaps 224(%rdi), %ymm0
-; AVX2-ONLY-NEXT:    vmovaps 224(%r8), %ymm6
-; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm1 = mem[0,0]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm6[0,1],ymm0[0,1]
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7]
-; AVX2-ONLY-NEXT:    vbroadcastsd 224(%rax), %ymm2
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 224(%rdx), %xmm2
+; AVX2-ONLY-NEXT:    vmovaps 224(%r8), %ymm4
 ; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm1 = mem[0,0]
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
-; AVX2-ONLY-NEXT:    vbroadcastsd 232(%rcx), %ymm3
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
-; AVX2-ONLY-NEXT:    vinsertf128 $1, 224(%r8), %ymm0, %ymm3
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm10 = ymm4[0,1],ymm0[0,1]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm10[0,1],ymm1[2,3],ymm10[4,5,6,7]
+; AVX2-ONLY-NEXT:    vbroadcastsd 224(%rax), %ymm10
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm10[4,5],ymm1[6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 224(%rsi), %ymm1
-; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm3 = mem[0,0]
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm15 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm15[4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
-; AVX2-ONLY-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3]
-; AVX2-ONLY-NEXT:    vmovaps 224(%rdx), %ymm1
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
-; AVX2-ONLY-NEXT:    vmovaps 240(%rax), %xmm3
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovaps (%rdi), %xmm3
-; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
-; AVX2-ONLY-NEXT:    vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5],ymm0[6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm0 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT:    # ymm0 = ymm13[1],mem[1],ymm13[3],mem[3]
-; AVX2-ONLY-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3]
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
-; AVX2-ONLY-NEXT:    vbroadcastsd 24(%rcx), %ymm15
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm15[0,1],ymm0[2,3,4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
-; AVX2-ONLY-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
-; AVX2-ONLY-NEXT:    vbroadcastsd 32(%rcx), %ymm15
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT:    # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
-; AVX2-ONLY-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3]
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
-; AVX2-ONLY-NEXT:    vbroadcastsd 56(%rcx), %ymm15
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm15[0,1],ymm0[2,3,4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm15 = xmm0[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovaps 64(%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm0, %ymm15, %ymm15
-; AVX2-ONLY-NEXT:    vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 16-byte Folded Reload
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm13 = ymm15[0,1,2,3],ymm13[4,5],ymm15[6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT:    # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3]
-; AVX2-ONLY-NEXT:    vpermpd {{.*#+}} ymm13 = ymm13[0,2,3,3]
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],mem[6,7]
-; AVX2-ONLY-NEXT:    vbroadcastsd 88(%rcx), %ymm15
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm13 = ymm15[0,1],ymm13[2,3,4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm13 = xmm13[0],mem[0]
-; AVX2-ONLY-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 16-byte Folded Reload
-; AVX2-ONLY-NEXT:    vbroadcastsd 96(%rcx), %ymm15
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm15[6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT:    # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3]
-; AVX2-ONLY-NEXT:    vpermpd {{.*#+}} ymm13 = ymm13[0,2,3,3]
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],mem[6,7]
-; AVX2-ONLY-NEXT:    vbroadcastsd 120(%rcx), %ymm15
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm13 = ymm15[0,1],ymm13[2,3,4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm13 = xmm14[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovaps 128(%rdi), %xmm14
-; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm14, %ymm13, %ymm13
-; AVX2-ONLY-NEXT:    vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm15[4,5],ymm13[6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 224(%rdx), %xmm1
+; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm10 = mem[0,0]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm10 = xmm10[1],xmm1[1]
+; AVX2-ONLY-NEXT:    vbroadcastsd 232(%rcx), %ymm15
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm15[4,5,6,7]
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 224(%r8), %ymm0, %ymm15
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm15[6,7]
+; AVX2-ONLY-NEXT:    vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 224(%rsi), %ymm10
+; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm15 = mem[0,0]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm14 = ymm0[0],ymm10[0],ymm0[2],ymm10[2]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3],ymm14[4,5,6,7]
+; AVX2-ONLY-NEXT:    vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm10[1],ymm0[3],ymm10[3]
+; AVX2-ONLY-NEXT:    vpermpd {{.*#+}} ymm10 = ymm0[0,2,3,3]
+; AVX2-ONLY-NEXT:    vmovaps 224(%rdx), %ymm0
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm0[6,7]
+; AVX2-ONLY-NEXT:    vmovaps 240(%rax), %xmm14
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm10 = ymm14[0,1],ymm10[2,3,4,5,6,7]
+; AVX2-ONLY-NEXT:    vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm10 = xmm10[0],mem[0]
+; AVX2-ONLY-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 16-byte Folded Reload
+; AVX2-ONLY-NEXT:    vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 16-byte Folded Reload
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm14[4,5],ymm10[6,7]
+; AVX2-ONLY-NEXT:    vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovups (%rsp), %ymm10 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm10 = ymm10[1],mem[1],ymm10[3],mem[3]
+; AVX2-ONLY-NEXT:    vpermpd {{.*#+}} ymm10 = ymm10[0,2,3,3]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],mem[6,7]
+; AVX2-ONLY-NEXT:    vbroadcastsd 24(%rcx), %ymm14
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm10 = ymm14[0,1],ymm10[2,3,4,5,6,7]
+; AVX2-ONLY-NEXT:    vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm10 = xmm10[0],mem[0]
+; AVX2-ONLY-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 16-byte Folded Reload
+; AVX2-ONLY-NEXT:    vbroadcastsd 32(%rcx), %ymm14
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm14[6,7]
+; AVX2-ONLY-NEXT:    vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm10 = ymm10[1],mem[1],ymm10[3],mem[3]
+; AVX2-ONLY-NEXT:    vpermpd {{.*#+}} ymm10 = ymm10[0,2,3,3]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],mem[6,7]
+; AVX2-ONLY-NEXT:    vbroadcastsd 56(%rcx), %ymm14
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm10 = ymm14[0,1],ymm10[2,3,4,5,6,7]
+; AVX2-ONLY-NEXT:    vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm10 = xmm10[0],mem[0]
+; AVX2-ONLY-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 16-byte Folded Reload
+; AVX2-ONLY-NEXT:    vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 16-byte Folded Reload
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm14[4,5],ymm10[6,7]
+; AVX2-ONLY-NEXT:    vmovups %ymm10, (%rsp) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm10 = ymm10[1],mem[1],ymm10[3],mem[3]
+; AVX2-ONLY-NEXT:    vpermpd {{.*#+}} ymm10 = ymm10[0,2,3,3]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],mem[6,7]
+; AVX2-ONLY-NEXT:    vbroadcastsd 88(%rcx), %ymm14
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm10 = ymm14[0,1],ymm10[2,3,4,5,6,7]
+; AVX2-ONLY-NEXT:    vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm10 = xmm10[0],mem[0]
+; AVX2-ONLY-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 16-byte Folded Reload
+; AVX2-ONLY-NEXT:    vbroadcastsd 96(%rcx), %ymm14
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm15 = ymm10[0,1,2,3,4,5],ymm14[6,7]
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm10 = ymm10[1],mem[1],ymm10[3],mem[3]
+; AVX2-ONLY-NEXT:    vpermpd {{.*#+}} ymm10 = ymm10[0,2,3,3]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],mem[6,7]
+; AVX2-ONLY-NEXT:    vbroadcastsd 120(%rcx), %ymm14
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm10 = ymm14[0,1],ymm10[2,3,4,5,6,7]
+; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm14 = xmm14[0],mem[0]
+; AVX2-ONLY-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 16-byte Folded Reload
+; AVX2-ONLY-NEXT:    vbroadcastsd %xmm13, %ymm13
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5],ymm14[6,7]
 ; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm11 = ymm12[1],ymm11[1],ymm12[3],ymm11[3]
 ; AVX2-ONLY-NEXT:    vpermpd {{.*#+}} ymm11 = ymm11[0,2,3,3]
 ; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],mem[6,7]
 ; AVX2-ONLY-NEXT:    vbroadcastsd 152(%rcx), %ymm12
 ; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm11 = ymm12[0,1],ymm11[2,3,4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm10 = xmm10[0],mem[0]
-; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm9, %ymm10, %ymm9
-; AVX2-ONLY-NEXT:    vbroadcastsd 160(%rcx), %ymm10
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm10[6,7]
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm7 = ymm8[1],ymm7[1],ymm8[3],ymm7[3]
-; AVX2-ONLY-NEXT:    vpermpd {{.*#+}} ymm7 = ymm7[0,2,3,3]
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],mem[6,7]
-; AVX2-ONLY-NEXT:    vbroadcastsd 184(%rcx), %ymm8
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3,4,5,6,7]
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm4 = xmm4[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovaps 192(%rdi), %xmm8
-; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm8, %ymm4, %ymm4
-; AVX2-ONLY-NEXT:    vbroadcastsd %xmm5, %ymm5
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5],ymm4[6,7]
-; AVX2-ONLY-NEXT:    vbroadcastsd 216(%rcx), %ymm5
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} xmm5 = xmm5[0,1],mem[2,3]
-; AVX2-ONLY-NEXT:    vbroadcastsd 216(%r9), %ymm10
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm10[4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovaps 224(%rdi), %xmm10
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm10 = xmm10[0],mem[0]
-; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm2, %ymm10, %ymm2
-; AVX2-ONLY-NEXT:    vbroadcastsd 224(%rcx), %ymm10
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm10[6,7]
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[2],mem[2]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm6[2,3]
-; AVX2-ONLY-NEXT:    vbroadcastsd 240(%r9), %ymm6
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm6[6,7]
-; AVX2-ONLY-NEXT:    vbroadcastsd 248(%rcx), %ymm6
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} xmm6 = xmm6[0,1],mem[2,3]
-; AVX2-ONLY-NEXT:    vbroadcastsd 248(%r9), %ymm10
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm10[4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovaps 224(%rax), %ymm10
-; AVX2-ONLY-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm12 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT:    # ymm12 = mem[0,1],ymm10[2,3],mem[4,5,6,7]
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm10[6,7]
-; AVX2-ONLY-NEXT:    movq {{[0-9]+}}(%rsp), %rdi
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],mem[6,7]
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm10 = xmm14[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovaps 128(%rdx), %xmm14
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm14 = xmm14[0],mem[0]
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovaps 64(%rdx), %xmm15
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm15 = xmm15[0],mem[0]
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm8 = xmm8[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovaps 192(%rdx), %xmm13
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm13 = xmm13[0],mem[0]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm9 = xmm9[0],mem[0]
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm8, %ymm9, %ymm8
+; AVX2-ONLY-NEXT:    vbroadcastsd 160(%rcx), %ymm9
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm9[6,7]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm6 = ymm7[1],ymm6[1],ymm7[3],ymm6[3]
+; AVX2-ONLY-NEXT:    vpermpd {{.*#+}} ymm6 = ymm6[0,2,3,3]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],mem[6,7]
+; AVX2-ONLY-NEXT:    vbroadcastsd 184(%rcx), %ymm7
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3,4,5,6,7]
 ; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm3 = xmm3[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovaps (%rdx), %xmm11
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm11 = xmm11[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm11, 16(%rdi)
-; AVX2-ONLY-NEXT:    vmovaps %xmm3, (%rdi)
-; AVX2-ONLY-NEXT:    vmovaps %xmm13, 1360(%rdi)
-; AVX2-ONLY-NEXT:    vmovaps %xmm8, 1344(%rdi)
-; AVX2-ONLY-NEXT:    vmovaps %xmm15, 464(%rdi)
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, 448(%rdi)
-; AVX2-ONLY-NEXT:    vmovaps %xmm14, 912(%rdi)
-; AVX2-ONLY-NEXT:    vmovaps %xmm10, 896(%rdi)
-; AVX2-ONLY-NEXT:    vmovaps %ymm6, 1760(%rdi)
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1728(%rdi)
-; AVX2-ONLY-NEXT:    vmovaps %ymm1, 1696(%rdi)
-; AVX2-ONLY-NEXT:    vmovaps %ymm12, 1664(%rdi)
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1632(%rdi)
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm5, %ymm3, %ymm3
+; AVX2-ONLY-NEXT:    vbroadcastsd %xmm2, %ymm2
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5],ymm3[6,7]
+; AVX2-ONLY-NEXT:    vbroadcastsd 216(%rcx), %ymm3
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} xmm3 = xmm3[0,1],mem[2,3]
+; AVX2-ONLY-NEXT:    vbroadcastsd 216(%r9), %ymm5
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7]
+; AVX2-ONLY-NEXT:    vmovaps 224(%rdi), %xmm5
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm5 = xmm5[0],mem[0]
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm1, %ymm5, %ymm1
+; AVX2-ONLY-NEXT:    vbroadcastsd 224(%rcx), %ymm5
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm5[6,7]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm4[2,3]
+; AVX2-ONLY-NEXT:    vbroadcastsd 240(%r9), %ymm4
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7]
+; AVX2-ONLY-NEXT:    vbroadcastsd 248(%rcx), %ymm4
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} xmm4 = xmm4[0,1],mem[2,3]
+; AVX2-ONLY-NEXT:    vbroadcastsd 248(%r9), %ymm5
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7]
+; AVX2-ONLY-NEXT:    vmovaps 224(%rax), %ymm5
+; AVX2-ONLY-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm7 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm7 = mem[0,1],ymm5[2,3],mem[4,5,6,7]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7]
+; AVX2-ONLY-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7]
+; AVX2-ONLY-NEXT:    vmovaps %ymm4, 1760(%rcx)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm4, 1728(%rcx)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1696(%rcx)
+; AVX2-ONLY-NEXT:    vmovaps %ymm7, 1664(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1600(%rdi)
-; AVX2-ONLY-NEXT:    vmovaps %ymm2, 1568(%rdi)
-; AVX2-ONLY-NEXT:    vmovaps %ymm5, 1536(%rdi)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1632(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1504(%rdi)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1600(%rcx)
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, 1568(%rcx)
+; AVX2-ONLY-NEXT:    vmovaps %ymm3, 1536(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1472(%rdi)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1504(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1440(%rdi)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1472(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1408(%rdi)
-; AVX2-ONLY-NEXT:    vmovaps %ymm4, 1376(%rdi)
-; AVX2-ONLY-NEXT:    vmovaps %ymm7, 1312(%rdi)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1440(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1280(%rdi)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1408(%rcx)
+; AVX2-ONLY-NEXT:    vmovaps %ymm2, 1376(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1248(%rdi)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1344(%rcx)
+; AVX2-ONLY-NEXT:    vmovaps %ymm6, 1312(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1216(%rdi)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1280(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1184(%rdi)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1248(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1152(%rdi)
-; AVX2-ONLY-NEXT:    vmovaps %ymm9, 1120(%rdi)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1216(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1088(%rdi)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1184(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1056(%rdi)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1152(%rcx)
+; AVX2-ONLY-NEXT:    vmovaps %ymm8, 1120(%rcx)
+; AVX2-ONLY-NEXT:    vmovaps %ymm11, 1088(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1024(%rdi)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1056(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 992(%rdi)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1024(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 960(%rdi)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 992(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 928(%rdi)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 960(%rcx)
+; AVX2-ONLY-NEXT:    vmovaps %ymm13, 928(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 864(%rdi)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 896(%rcx)
+; AVX2-ONLY-NEXT:    vmovaps %ymm10, 864(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 832(%rdi)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 832(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 800(%rdi)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 800(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 768(%rdi)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 768(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 736(%rdi)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 736(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 704(%rdi)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 704(%rcx)
+; AVX2-ONLY-NEXT:    vmovaps %ymm15, 672(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 672(%rdi)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 640(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 640(%rdi)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 608(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 608(%rdi)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 576(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 576(%rdi)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 544(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 544(%rdi)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 512(%rcx)
+; AVX2-ONLY-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 480(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 512(%rdi)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 448(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 480(%rdi)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 416(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 416(%rdi)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 384(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 384(%rdi)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 352(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 352(%rdi)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 320(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 320(%rdi)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 288(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 288(%rdi)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 256(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 256(%rdi)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 224(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 224(%rdi)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 192(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 192(%rdi)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 160(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 160(%rdi)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 128(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 128(%rdi)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 96(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 96(%rdi)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 64(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 64(%rdi)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 32(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 32(%rdi)
-; AVX2-ONLY-NEXT:    addq $1464, %rsp # imm = 0x5B8
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, (%rcx)
+; AVX2-ONLY-NEXT:    addq $1672, %rsp # imm = 0x688
 ; AVX2-ONLY-NEXT:    vzeroupper
 ; AVX2-ONLY-NEXT:    retq
 ;
@@ -11563,78 +11579,95 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ;
 ; AVX2-ONLY-LABEL: store_i64_stride7_vf64:
 ; AVX2-ONLY:       # %bb.0:
-; AVX2-ONLY-NEXT:    subq $3624, %rsp # imm = 0xE28
+; AVX2-ONLY-NEXT:    subq $3896, %rsp # imm = 0xF38
 ; AVX2-ONLY-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-ONLY-NEXT:    vmovaps 32(%rdi), %ymm0
-; AVX2-ONLY-NEXT:    vmovaps (%rdi), %ymm1
-; AVX2-ONLY-NEXT:    vmovaps (%rsi), %ymm2
-; AVX2-ONLY-NEXT:    vmovaps (%rdx), %ymm3
-; AVX2-ONLY-NEXT:    vmovaps (%r8), %ymm6
+; AVX2-ONLY-NEXT:    vmovaps (%rdi), %ymm0
+; AVX2-ONLY-NEXT:    vmovaps (%rsi), %ymm1
+; AVX2-ONLY-NEXT:    vmovaps (%rdx), %ymm2
+; AVX2-ONLY-NEXT:    vmovaps (%r8), %ymm5
+; AVX2-ONLY-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps (%r9), %ymm6
 ; AVX2-ONLY-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps (%r9), %ymm7
-; AVX2-ONLY-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm4 = mem[0,0]
-; AVX2-ONLY-NEXT:    vbroadcastsd 8(%rcx), %ymm5
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm5 = ymm3[0,1,2,3],ymm5[4,5,6,7]
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3,4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovaps (%r8), %xmm9
-; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm9, %ymm0, %ymm5
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm4 = ymm1[0],ymm2[0],ymm1[2],ymm2[2]
-; AVX2-ONLY-NEXT:    vmovaps (%rax), %xmm11
-; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm5 = mem[0,0]
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm5 = xmm5[1],xmm11[1]
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
+; AVX2-ONLY-NEXT:    vmovaps (%rsi), %xmm3
+; AVX2-ONLY-NEXT:    vinsertf128 $1, (%rcx), %ymm3, %ymm3
+; AVX2-ONLY-NEXT:    vmovaps (%rdi), %xmm4
 ; AVX2-ONLY-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm4 = ymm6[0],ymm7[0],ymm6[2],ymm7[2]
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm5 = ymm3[0],mem[0],ymm3[2],mem[2]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm5[2,3],ymm4[2,3]
+; AVX2-ONLY-NEXT:    vmovaps 32(%rdi), %xmm7
+; AVX2-ONLY-NEXT:    vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-ONLY-NEXT:    vinsertf128 $1, (%rdx), %ymm4, %ymm4
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[2],ymm3[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm3 = mem[0,0]
+; AVX2-ONLY-NEXT:    vbroadcastsd 8(%rcx), %ymm4
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm4 = ymm2[0,1,2,3],ymm4[4,5,6,7]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3,4,5,6,7]
+; AVX2-ONLY-NEXT:    vmovaps (%r8), %xmm4
+; AVX2-ONLY-NEXT:    vmovaps %xmm4, (%rsp) # 16-byte Spill
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm4
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7]
+; AVX2-ONLY-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; AVX2-ONLY-NEXT:    vmovaps (%rax), %xmm8
+; AVX2-ONLY-NEXT:    vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm4 = mem[0,0]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm4 = xmm4[1],xmm8[1]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX2-ONLY-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm3 = ymm5[0],ymm6[0],ymm5[2],ymm6[2]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm4 = ymm2[0],mem[0],ymm2[2],mem[2]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3]
+; AVX2-ONLY-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 16(%rax), %xmm3
+; AVX2-ONLY-NEXT:    vmovaps 32(%rax), %xmm4
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; AVX2-ONLY-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5,6,7]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm7, %ymm0, %ymm0
+; AVX2-ONLY-NEXT:    vmovaps 32(%r8), %xmm1
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm2 = xmm1[0],mem[0]
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm4, %ymm2, %ymm2
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 32(%rdx), %xmm2
+; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1]
+; AVX2-ONLY-NEXT:    vbroadcastsd 40(%rcx), %ymm2
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm4[1]
+; AVX2-ONLY-NEXT:    vmovaps 32(%rdi), %ymm1
+; AVX2-ONLY-NEXT:    vmovaps 32(%rsi), %ymm2
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm3 = ymm1[0],ymm2[0],ymm1[2],ymm2[2]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 32(%rdx), %ymm0
+; AVX2-ONLY-NEXT:    vmovaps 32(%r8), %ymm3
+; AVX2-ONLY-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 32(%r9), %ymm4
 ; AVX2-ONLY-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 16(%rax), %xmm4
-; AVX2-ONLY-NEXT:    vmovaps 32(%rax), %xmm5
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm4 = ymm0[0],mem[0],ymm0[2],mem[2]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3]
+; AVX2-ONLY-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3]
 ; AVX2-ONLY-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3]
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7]
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,3,4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 32(%rdi), %xmm15
-; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm15, %ymm0, %ymm1
-; AVX2-ONLY-NEXT:    vmovaps 32(%r8), %xmm2
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm3 = xmm2[0],mem[0]
-; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm5, %ymm3, %ymm3
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 32(%rdx), %xmm3
-; AVX2-ONLY-NEXT:    vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm1 = mem[0,0]
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1]
-; AVX2-ONLY-NEXT:    vbroadcastsd 40(%rcx), %ymm3
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovaps 32(%rsi), %ymm3
-; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm1 = mem[0,0]
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm5[1]
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 32(%rdx), %ymm1
-; AVX2-ONLY-NEXT:    vmovaps 32(%r8), %ymm2
-; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 32(%r9), %ymm4
-; AVX2-ONLY-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm4[0],ymm2[2],ymm4[2]
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm4 = ymm1[0],mem[0],ymm1[2],mem[2]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm4[2,3],ymm2[2,3]
-; AVX2-ONLY-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm3[1],ymm0[3],ymm3[3]
-; AVX2-ONLY-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3]
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
 ; AVX2-ONLY-NEXT:    vmovaps 48(%rax), %xmm1
 ; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 64(%rsi), %xmm0
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 64(%rcx), %ymm0, %ymm0
+; AVX2-ONLY-NEXT:    vmovaps 64(%rdi), %xmm1
+; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 64(%rdx), %ymm1, %ymm1
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovaps 64(%rdx), %ymm0
 ; AVX2-ONLY-NEXT:    vbroadcastsd 72(%rcx), %ymm1
 ; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
@@ -11708,6 +11741,13 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-ONLY-NEXT:    vmovaps 112(%rax), %xmm1
 ; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 128(%rsi), %xmm0
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 128(%rcx), %ymm0, %ymm0
+; AVX2-ONLY-NEXT:    vmovaps 128(%rdi), %xmm1
+; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 128(%rdx), %ymm1, %ymm1
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovaps 128(%rdx), %ymm0
 ; AVX2-ONLY-NEXT:    vbroadcastsd 136(%rcx), %ymm1
 ; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
@@ -11781,6 +11821,13 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-ONLY-NEXT:    vmovaps 176(%rax), %xmm1
 ; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 192(%rsi), %xmm0
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 192(%rcx), %ymm0, %ymm0
+; AVX2-ONLY-NEXT:    vmovaps 192(%rdi), %xmm1
+; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 192(%rdx), %ymm1, %ymm1
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovaps 192(%rdx), %ymm0
 ; AVX2-ONLY-NEXT:    vbroadcastsd 200(%rcx), %ymm1
 ; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
@@ -11840,11 +11887,11 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovaps 224(%rdx), %ymm0
-; AVX2-ONLY-NEXT:    vmovaps 224(%r8), %ymm4
-; AVX2-ONLY-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 224(%r9), %ymm3
+; AVX2-ONLY-NEXT:    vmovaps 224(%r8), %ymm3
 ; AVX2-ONLY-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[2],ymm3[2]
+; AVX2-ONLY-NEXT:    vmovaps 224(%r9), %ymm4
+; AVX2-ONLY-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2]
 ; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm4 = ymm0[0],mem[0],ymm0[2],mem[2]
 ; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3]
 ; AVX2-ONLY-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -11854,6 +11901,13 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-ONLY-NEXT:    vmovaps 240(%rax), %xmm1
 ; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 256(%rsi), %xmm0
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 256(%rcx), %ymm0, %ymm0
+; AVX2-ONLY-NEXT:    vmovaps 256(%rdi), %xmm1
+; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 256(%rdx), %ymm1, %ymm1
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovaps 256(%rdx), %ymm0
 ; AVX2-ONLY-NEXT:    vbroadcastsd 264(%rcx), %ymm1
 ; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
@@ -11892,7 +11946,7 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-ONLY-NEXT:    vmovaps 288(%rax), %xmm2
 ; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
 ; AVX2-ONLY-NEXT:    vmovaps 288(%rdi), %xmm3
-; AVX2-ONLY-NEXT:    vmovaps %xmm3, (%rsp) # 16-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm3
 ; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -11927,29 +11981,35 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-ONLY-NEXT:    vmovaps 304(%rax), %xmm1
 ; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 320(%rsi), %xmm0
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 320(%rcx), %ymm0, %ymm0
+; AVX2-ONLY-NEXT:    vmovaps 320(%rdi), %xmm1
+; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 320(%rdx), %ymm1, %ymm1
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovaps 320(%rdx), %ymm0
 ; AVX2-ONLY-NEXT:    vbroadcastsd 328(%rcx), %ymm1
 ; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
 ; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm2 = mem[0,0]
 ; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovaps 320(%r8), %xmm7
-; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm7, %ymm0, %ymm2
+; AVX2-ONLY-NEXT:    vmovaps 320(%r8), %xmm2
+; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
 ; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 320(%rax), %xmm2
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 320(%rax), %xmm9
 ; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm1 = mem[0,0]
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm9[1]
 ; AVX2-ONLY-NEXT:    vmovaps 320(%rdi), %ymm2
 ; AVX2-ONLY-NEXT:    vmovaps 320(%rsi), %ymm3
 ; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
 ; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 320(%r8), %ymm4
-; AVX2-ONLY-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 320(%r9), %ymm1
+; AVX2-ONLY-NEXT:    vmovaps 320(%r8), %ymm1
 ; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm4[0],ymm1[0],ymm4[2],ymm1[2]
+; AVX2-ONLY-NEXT:    vmovaps 320(%r9), %ymm7
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm7[0],ymm1[2],ymm7[2]
 ; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm4 = ymm0[0],mem[0],ymm0[2],mem[2]
 ; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[2,3]
 ; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -11968,10 +12028,9 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm3
 ; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 352(%rdx), %xmm3
-; AVX2-ONLY-NEXT:    vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 352(%rdx), %xmm5
 ; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm1 = mem[0,0]
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm5[1]
 ; AVX2-ONLY-NEXT:    vbroadcastsd 360(%rcx), %ymm3
 ; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
 ; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
@@ -11985,11 +12044,11 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovaps 352(%rdx), %ymm0
-; AVX2-ONLY-NEXT:    vmovaps 352(%r8), %ymm4
-; AVX2-ONLY-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 352(%r9), %ymm3
+; AVX2-ONLY-NEXT:    vmovaps 352(%r8), %ymm3
 ; AVX2-ONLY-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[2],ymm3[2]
+; AVX2-ONLY-NEXT:    vmovaps 352(%r9), %ymm4
+; AVX2-ONLY-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2]
 ; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm4 = ymm0[0],mem[0],ymm0[2],mem[2]
 ; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3]
 ; AVX2-ONLY-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -11999,6 +12058,13 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-ONLY-NEXT:    vmovaps 368(%rax), %xmm1
 ; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 384(%rsi), %xmm0
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 384(%rcx), %ymm0, %ymm0
+; AVX2-ONLY-NEXT:    vmovaps 384(%rdi), %xmm1
+; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 384(%rdx), %ymm1, %ymm1
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovaps 384(%rdx), %ymm0
 ; AVX2-ONLY-NEXT:    vbroadcastsd 392(%rcx), %ymm1
 ; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
@@ -12009,22 +12075,22 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
 ; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 384(%rax), %xmm6
+; AVX2-ONLY-NEXT:    vmovaps 384(%rax), %xmm15
 ; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm1 = mem[0,0]
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm6[1]
-; AVX2-ONLY-NEXT:    vmovaps 384(%rdi), %ymm3
-; AVX2-ONLY-NEXT:    vmovaps 384(%rsi), %ymm4
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm3[0],ymm4[0],ymm3[2],ymm4[2]
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm15[1]
+; AVX2-ONLY-NEXT:    vmovaps 384(%rdi), %ymm2
+; AVX2-ONLY-NEXT:    vmovaps 384(%rsi), %ymm3
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovaps 384(%r8), %ymm1
 ; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 384(%r9), %ymm2
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2]
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm5 = ymm0[0],mem[0],ymm0[2],mem[2]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm5[2,3],ymm1[2,3]
+; AVX2-ONLY-NEXT:    vmovaps 384(%r9), %ymm14
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm14[0],ymm1[2],ymm14[2]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm4 = ymm0[0],mem[0],ymm0[2],mem[2]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[2,3]
 ; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm1 = ymm3[1],ymm4[1],ymm3[3],ymm4[3]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
 ; AVX2-ONLY-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3]
 ; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
 ; AVX2-ONLY-NEXT:    vmovaps 400(%rax), %xmm1
@@ -12032,583 +12098,523 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovaps 416(%r8), %xmm0
 ; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovaps 416(%rax), %xmm3
-; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
-; AVX2-ONLY-NEXT:    vmovaps 416(%rdi), %xmm4
-; AVX2-ONLY-NEXT:    vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm4
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7]
+; AVX2-ONLY-NEXT:    vmovaps 416(%rax), %xmm2
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX2-ONLY-NEXT:    vmovaps 416(%rdi), %xmm12
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm12, %ymm0, %ymm3
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 416(%rdx), %xmm12
+; AVX2-ONLY-NEXT:    vmovaps 416(%rdx), %xmm13
 ; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm1 = mem[0,0]
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm12[1]
-; AVX2-ONLY-NEXT:    vbroadcastsd 424(%rcx), %ymm4
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm13[1]
+; AVX2-ONLY-NEXT:    vbroadcastsd 424(%rcx), %ymm3
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
 ; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
 ; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1]
 ; AVX2-ONLY-NEXT:    vmovaps 416(%rdi), %ymm1
-; AVX2-ONLY-NEXT:    vmovaps 416(%rsi), %ymm3
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm4 = ymm1[0],ymm3[0],ymm1[2],ymm3[2]
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7]
+; AVX2-ONLY-NEXT:    vmovaps 416(%rsi), %ymm2
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm3 = ymm1[0],ymm2[0],ymm1[2],ymm2[2]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovaps 416(%r8), %ymm0
 ; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2]
-; AVX2-ONLY-NEXT:    vmovaps 416(%rdx), %ymm4
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm5 = ymm4[0],mem[0],ymm4[2],mem[2]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm5[2,3],ymm0[2,3]
+; AVX2-ONLY-NEXT:    vmovaps 416(%rdx), %ymm3
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm4 = ymm3[0],mem[0],ymm3[2],mem[2]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm4[2,3],ymm0[2,3]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm3[1],ymm1[3],ymm3[3]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm2[1],ymm1[3],ymm2[3]
 ; AVX2-ONLY-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3]
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7]
 ; AVX2-ONLY-NEXT:    vmovaps 432(%rax), %xmm1
 ; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 448(%rdi), %ymm1
-; AVX2-ONLY-NEXT:    vmovaps 448(%r8), %ymm13
-; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm13[0,1],ymm1[0,1]
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5,6,7]
-; AVX2-ONLY-NEXT:    vbroadcastsd 448(%rax), %ymm3
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 448(%rdx), %ymm8
-; AVX2-ONLY-NEXT:    vbroadcastsd 456(%rcx), %ymm3
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm3 = ymm8[0,1,2,3],ymm3[4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm4 = mem[0,0]
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3,4,5,6,7]
-; AVX2-ONLY-NEXT:    vinsertf128 $1, 448(%r8), %ymm0, %ymm4
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm4[6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 448(%rsi), %ymm3
-; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm4 = mem[0,0]
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm5 = ymm1[0],ymm3[0],ymm1[2],ymm3[2]
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm5[4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm3[1],ymm1[3],ymm3[3]
-; AVX2-ONLY-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3]
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm8[6,7]
-; AVX2-ONLY-NEXT:    vmovaps 464(%rax), %xmm3
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm1[2,3,4,5,6,7]
+; AVX2-ONLY-NEXT:    vmovaps 448(%rsi), %xmm0
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 448(%rcx), %ymm0, %ymm0
+; AVX2-ONLY-NEXT:    vmovaps 448(%rdi), %xmm1
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 448(%rdx), %ymm1, %ymm1
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 480(%rdi), %ymm1
-; AVX2-ONLY-NEXT:    vmovaps 480(%r8), %ymm14
+; AVX2-ONLY-NEXT:    vmovaps 448(%rdi), %ymm0
+; AVX2-ONLY-NEXT:    vmovaps 448(%r8), %ymm10
+; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm1 = mem[0,0]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm10[0,1],ymm0[0,1]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7]
+; AVX2-ONLY-NEXT:    vbroadcastsd 448(%rax), %ymm2
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7]
+; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 448(%rdx), %ymm2
+; AVX2-ONLY-NEXT:    vbroadcastsd 456(%rcx), %ymm1
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
 ; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm3 = mem[0,0]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm14[0,1],ymm1[0,1]
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3],ymm4[4,5,6,7]
-; AVX2-ONLY-NEXT:    vbroadcastsd 480(%rax), %ymm4
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm4[4,5],ymm3[6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 480(%rdx), %xmm10
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4,5,6,7]
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 448(%r8), %ymm0, %ymm3
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7]
+; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 448(%rsi), %ymm1
 ; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm3 = mem[0,0]
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm10[1]
-; AVX2-ONLY-NEXT:    vbroadcastsd 488(%rcx), %ymm4
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
 ; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7]
-; AVX2-ONLY-NEXT:    vinsertf128 $1, 480(%r8), %ymm0, %ymm4
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm4[6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 480(%rsi), %ymm3
-; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm4 = mem[0,0]
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm5 = ymm1[0],ymm3[0],ymm1[2],ymm3[2]
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm5[4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm3[1],ymm1[3],ymm3[3]
-; AVX2-ONLY-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3]
-; AVX2-ONLY-NEXT:    vmovaps 480(%rdx), %ymm5
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm5[6,7]
-; AVX2-ONLY-NEXT:    vmovaps 496(%rax), %xmm3
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm1[2,3,4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm9[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovaps (%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm1
-; AVX2-ONLY-NEXT:    vbroadcastsd %xmm11, %ymm3
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm3[4,5],ymm1[6,7]
+; AVX2-ONLY-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; AVX2-ONLY-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
+; AVX2-ONLY-NEXT:    vmovaps 464(%rax), %xmm1
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT:    # ymm1 = ymm0[1],mem[1],ymm0[3],mem[3]
-; AVX2-ONLY-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3]
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7]
-; AVX2-ONLY-NEXT:    vbroadcastsd 24(%rcx), %ymm3
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm1[2,3,4,5,6,7]
+; AVX2-ONLY-NEXT:    vmovaps 480(%rdi), %ymm0
+; AVX2-ONLY-NEXT:    vmovaps 480(%r8), %ymm11
+; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm1 = mem[0,0]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm11[0,1],ymm0[0,1]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5,6,7]
+; AVX2-ONLY-NEXT:    vbroadcastsd 480(%rax), %ymm3
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5],ymm1[6,7]
+; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 480(%rdx), %xmm8
+; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm1 = mem[0,0]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm8[1]
+; AVX2-ONLY-NEXT:    vbroadcastsd 488(%rcx), %ymm3
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
+; AVX2-ONLY-NEXT:    vinsertf128 $1, 480(%r8), %ymm0, %ymm3
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7]
+; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vmovaps 480(%rsi), %ymm1
+; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm3 = mem[0,0]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7]
+; AVX2-ONLY-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; AVX2-ONLY-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3]
+; AVX2-ONLY-NEXT:    vmovaps 480(%rdx), %ymm6
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm6[6,7]
+; AVX2-ONLY-NEXT:    vmovaps 496(%rax), %xmm1
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm15[0],mem[0]
-; AVX2-ONLY-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 16-byte Folded Reload
-; AVX2-ONLY-NEXT:    vbroadcastsd 32(%rcx), %ymm3
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm3[6,7]
+; AVX2-ONLY-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
+; AVX2-ONLY-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
+; AVX2-ONLY-NEXT:    vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 16-byte Folded Reload
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT:    # ymm1 = ymm0[1],mem[1],ymm0[3],mem[3]
-; AVX2-ONLY-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3]
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7]
-; AVX2-ONLY-NEXT:    vbroadcastsd 56(%rcx), %ymm3
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm1[2,3,4,5,6,7]
+; AVX2-ONLY-NEXT:    vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
+; AVX2-ONLY-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
+; AVX2-ONLY-NEXT:    vbroadcastsd 24(%rcx), %ymm1
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovaps 64(%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm1
-; AVX2-ONLY-NEXT:    vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 16-byte Folded Reload
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm3[4,5],ymm1[6,7]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
+; AVX2-ONLY-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
+; AVX2-ONLY-NEXT:    vbroadcastsd 32(%rcx), %ymm1
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT:    # ymm1 = ymm0[1],mem[1],ymm0[3],mem[3]
-; AVX2-ONLY-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3]
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7]
-; AVX2-ONLY-NEXT:    vbroadcastsd 88(%rcx), %ymm3
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm1[2,3,4,5,6,7]
+; AVX2-ONLY-NEXT:    vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
+; AVX2-ONLY-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
+; AVX2-ONLY-NEXT:    vbroadcastsd 56(%rcx), %ymm1
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0]
-; AVX2-ONLY-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 16-byte Folded Reload
-; AVX2-ONLY-NEXT:    vbroadcastsd 96(%rcx), %ymm3
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm3[6,7]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
+; AVX2-ONLY-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
+; AVX2-ONLY-NEXT:    vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 16-byte Folded Reload
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT:    # ymm1 = ymm0[1],mem[1],ymm0[3],mem[3]
-; AVX2-ONLY-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3]
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7]
-; AVX2-ONLY-NEXT:    vbroadcastsd 120(%rcx), %ymm3
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm1[2,3,4,5,6,7]
+; AVX2-ONLY-NEXT:    vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
+; AVX2-ONLY-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
+; AVX2-ONLY-NEXT:    vbroadcastsd 88(%rcx), %ymm1
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovaps 128(%rdi), %xmm11
-; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm11, %ymm1, %ymm1
-; AVX2-ONLY-NEXT:    vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 16-byte Folded Reload
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm3[4,5],ymm1[6,7]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
+; AVX2-ONLY-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
+; AVX2-ONLY-NEXT:    vbroadcastsd 96(%rcx), %ymm1
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT:    # ymm1 = ymm0[1],mem[1],ymm0[3],mem[3]
-; AVX2-ONLY-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3]
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7]
-; AVX2-ONLY-NEXT:    vbroadcastsd 152(%rcx), %ymm3
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm1[2,3,4,5,6,7]
+; AVX2-ONLY-NEXT:    vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
+; AVX2-ONLY-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
+; AVX2-ONLY-NEXT:    vbroadcastsd 120(%rcx), %ymm1
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0]
-; AVX2-ONLY-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 16-byte Folded Reload
-; AVX2-ONLY-NEXT:    vbroadcastsd 160(%rcx), %ymm3
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm3[6,7]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
+; AVX2-ONLY-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
+; AVX2-ONLY-NEXT:    vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 16-byte Folded Reload
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT:    # ymm1 = ymm0[1],mem[1],ymm0[3],mem[3]
-; AVX2-ONLY-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3]
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7]
-; AVX2-ONLY-NEXT:    vbroadcastsd 184(%rcx), %ymm3
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm1[2,3,4,5,6,7]
+; AVX2-ONLY-NEXT:    vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
+; AVX2-ONLY-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
+; AVX2-ONLY-NEXT:    vbroadcastsd 152(%rcx), %ymm1
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovaps 192(%rdi), %xmm9
-; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm9, %ymm1, %ymm1
-; AVX2-ONLY-NEXT:    vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 16-byte Folded Reload
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm3[4,5],ymm1[6,7]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
+; AVX2-ONLY-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
+; AVX2-ONLY-NEXT:    vbroadcastsd 160(%rcx), %ymm1
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT:    # ymm1 = ymm0[1],mem[1],ymm0[3],mem[3]
-; AVX2-ONLY-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3]
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7]
-; AVX2-ONLY-NEXT:    vbroadcastsd 216(%rcx), %ymm3
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm1[2,3,4,5,6,7]
+; AVX2-ONLY-NEXT:    vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
+; AVX2-ONLY-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
+; AVX2-ONLY-NEXT:    vbroadcastsd 184(%rcx), %ymm1
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0]
-; AVX2-ONLY-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 16-byte Folded Reload
-; AVX2-ONLY-NEXT:    vbroadcastsd 224(%rcx), %ymm3
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm3[6,7]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
+; AVX2-ONLY-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
+; AVX2-ONLY-NEXT:    vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 16-byte Folded Reload
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT:    # ymm1 = ymm0[1],mem[1],ymm0[3],mem[3]
-; AVX2-ONLY-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3]
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7]
-; AVX2-ONLY-NEXT:    vbroadcastsd 248(%rcx), %ymm3
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm1[2,3,4,5,6,7]
+; AVX2-ONLY-NEXT:    vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
+; AVX2-ONLY-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
+; AVX2-ONLY-NEXT:    vbroadcastsd 216(%rcx), %ymm1
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovaps 256(%rdi), %xmm15
-; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm15, %ymm1, %ymm1
-; AVX2-ONLY-NEXT:    vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 16-byte Folded Reload
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm3[4,5],ymm1[6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT:    # ymm1 = ymm0[1],mem[1],ymm0[3],mem[3]
-; AVX2-ONLY-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3]
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7]
-; AVX2-ONLY-NEXT:    vbroadcastsd 280(%rcx), %ymm3
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm1[2,3,4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0]
-; AVX2-ONLY-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 16-byte Folded Reload
-; AVX2-ONLY-NEXT:    vbroadcastsd 288(%rcx), %ymm3
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm3[6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT:    # ymm1 = ymm0[1],mem[1],ymm0[3],mem[3]
-; AVX2-ONLY-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3]
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7]
-; AVX2-ONLY-NEXT:    vbroadcastsd 312(%rcx), %ymm3
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm1[2,3,4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm7[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovaps 320(%rdi), %xmm7
-; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm7, %ymm1, %ymm1
-; AVX2-ONLY-NEXT:    vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 16-byte Folded Reload
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm3[4,5],ymm1[6,7]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
+; AVX2-ONLY-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
+; AVX2-ONLY-NEXT:    vbroadcastsd 224(%rcx), %ymm1
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT:    # ymm1 = ymm0[1],mem[1],ymm0[3],mem[3]
-; AVX2-ONLY-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3]
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7]
-; AVX2-ONLY-NEXT:    vbroadcastsd 344(%rcx), %ymm3
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm1[2,3,4,5,6,7]
+; AVX2-ONLY-NEXT:    vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
+; AVX2-ONLY-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
+; AVX2-ONLY-NEXT:    vbroadcastsd 248(%rcx), %ymm1
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0]
-; AVX2-ONLY-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 16-byte Folded Reload
-; AVX2-ONLY-NEXT:    vbroadcastsd 352(%rcx), %ymm3
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm3[6,7]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
+; AVX2-ONLY-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
+; AVX2-ONLY-NEXT:    vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 16-byte Folded Reload
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT:    # ymm1 = ymm0[1],mem[1],ymm0[3],mem[3]
-; AVX2-ONLY-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3]
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7]
-; AVX2-ONLY-NEXT:    vbroadcastsd 376(%rcx), %ymm3
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm1[2,3,4,5,6,7]
+; AVX2-ONLY-NEXT:    vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
+; AVX2-ONLY-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
+; AVX2-ONLY-NEXT:    vbroadcastsd 280(%rcx), %ymm1
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm3 = xmm0[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovaps 384(%rdi), %xmm4
-; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm4, %ymm3, %ymm3
-; AVX2-ONLY-NEXT:    vbroadcastsd %xmm6, %ymm6
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm6[4,5],ymm3[6,7]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
+; AVX2-ONLY-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
+; AVX2-ONLY-NEXT:    vbroadcastsd 288(%rcx), %ymm1
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm2 = ymm0[1],ymm2[1],ymm0[3],ymm2[3]
-; AVX2-ONLY-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,2,3,3]
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],mem[6,7]
-; AVX2-ONLY-NEXT:    vbroadcastsd 408(%rcx), %ymm3
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm2[2,3,4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm2 = xmm0[0],mem[0]
-; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm12, %ymm2, %ymm2
-; AVX2-ONLY-NEXT:    vbroadcastsd 416(%rcx), %ymm3
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm3[6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vbroadcastsd 440(%rcx), %ymm2
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
-; AVX2-ONLY-NEXT:    vbroadcastsd 440(%r9), %ymm3
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm8[0],mem[0],ymm8[2],mem[2]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],ymm13[2,3]
-; AVX2-ONLY-NEXT:    vbroadcastsd 464(%r9), %ymm3
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm3[6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vbroadcastsd 472(%rcx), %ymm2
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
-; AVX2-ONLY-NEXT:    vbroadcastsd 472(%r9), %ymm3
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovaps 448(%rax), %ymm3
-; AVX2-ONLY-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm1 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT:    # ymm1 = mem[0,1],ymm3[2,3],mem[4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm3[6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vmovaps 480(%rdi), %xmm2
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm2 = xmm2[0],mem[0]
-; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm10, %ymm2, %ymm2
-; AVX2-ONLY-NEXT:    vbroadcastsd 480(%rcx), %ymm3
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm3[6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm5[0],mem[0],ymm5[2],mem[2]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],ymm14[2,3]
-; AVX2-ONLY-NEXT:    vbroadcastsd 496(%r9), %ymm3
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm3[6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vbroadcastsd 504(%rcx), %ymm2
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
-; AVX2-ONLY-NEXT:    vbroadcastsd 504(%r9), %ymm3
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovaps 480(%rax), %ymm8
-; AVX2-ONLY-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm1 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT:    # ymm1 = mem[0,1],ymm8[2,3],mem[4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm1, (%rsp) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm8[6,7]
-; AVX2-ONLY-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    movq {{[0-9]+}}(%rsp), %r8
+; AVX2-ONLY-NEXT:    vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
+; AVX2-ONLY-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3]
 ; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
+; AVX2-ONLY-NEXT:    vbroadcastsd 312(%rcx), %ymm1
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm3 = xmm15[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovaps 256(%rdx), %xmm0
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm2 = xmm0[0],mem[0]
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm11 = xmm11[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovaps 128(%rdx), %xmm15
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm15 = xmm15[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovaps 64(%rdx), %xmm12
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm12 = xmm12[0],mem[0]
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm9 = xmm9[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovaps 192(%rdx), %xmm13
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm13 = xmm13[0],mem[0]
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm7 = xmm7[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovaps 320(%rdx), %xmm6
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm6 = xmm6[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovaps 448(%rdi), %xmm10
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm10 = xmm10[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovaps 448(%rdx), %xmm8
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm8 = xmm8[0],mem[0]
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm4 = xmm4[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovaps 384(%rdx), %xmm14
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm14 = xmm14[0],mem[0]
 ; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovaps (%rdx), %xmm5
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm5 = xmm5[0],mem[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm5, 16(%r8)
-; AVX2-ONLY-NEXT:    vmovaps %xmm0, (%r8)
-; AVX2-ONLY-NEXT:    vmovaps %xmm14, 2704(%r8)
-; AVX2-ONLY-NEXT:    vmovaps %xmm4, 2688(%r8)
-; AVX2-ONLY-NEXT:    vmovaps %xmm8, 3152(%r8)
-; AVX2-ONLY-NEXT:    vmovaps %xmm10, 3136(%r8)
-; AVX2-ONLY-NEXT:    vmovaps %xmm6, 2256(%r8)
-; AVX2-ONLY-NEXT:    vmovaps %xmm7, 2240(%r8)
-; AVX2-ONLY-NEXT:    vmovaps %xmm13, 1360(%r8)
-; AVX2-ONLY-NEXT:    vmovaps %xmm9, 1344(%r8)
-; AVX2-ONLY-NEXT:    vmovaps %xmm12, 464(%r8)
-; AVX2-ONLY-NEXT:    vmovaps %xmm1, 448(%r8)
-; AVX2-ONLY-NEXT:    vmovaps %xmm15, 912(%r8)
-; AVX2-ONLY-NEXT:    vmovaps %xmm11, 896(%r8)
-; AVX2-ONLY-NEXT:    vmovaps %xmm2, 1808(%r8)
-; AVX2-ONLY-NEXT:    vmovaps %xmm3, 1792(%r8)
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 3552(%r8)
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 3520(%r8)
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 3488(%r8)
-; AVX2-ONLY-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 3456(%r8)
-; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 3424(%r8)
+; AVX2-ONLY-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
+; AVX2-ONLY-NEXT:    vbroadcastsd %xmm9, %ymm1
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 3392(%r8)
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm7[1],ymm0[3],ymm7[3]
+; AVX2-ONLY-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
+; AVX2-ONLY-NEXT:    vbroadcastsd 344(%rcx), %ymm1
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm7 = ymm1[0,1],ymm0[2,3,4,5,6,7]
+; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm5, %ymm0, %ymm0
+; AVX2-ONLY-NEXT:    vbroadcastsd 352(%rcx), %ymm1
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm5 = ymm0[0,1,2,3,4,5],ymm1[6,7]
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 3360(%r8)
+; AVX2-ONLY-NEXT:    vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
+; AVX2-ONLY-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
+; AVX2-ONLY-NEXT:    vbroadcastsd 376(%rcx), %ymm1
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm4 = ymm1[0,1],ymm0[2,3,4,5,6,7]
+; AVX2-ONLY-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
+; AVX2-ONLY-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
+; AVX2-ONLY-NEXT:    vbroadcastsd %xmm15, %ymm15
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3],ymm15[4,5],ymm0[6,7]
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 3328(%r8)
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm15 = ymm0[1],ymm14[1],ymm0[3],ymm14[3]
+; AVX2-ONLY-NEXT:    vpermpd {{.*#+}} ymm15 = ymm15[0,2,3,3]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3,4,5],mem[6,7]
+; AVX2-ONLY-NEXT:    vbroadcastsd 408(%rcx), %ymm14
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm15 = ymm14[0,1],ymm15[2,3,4,5,6,7]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm14 = xmm12[0],mem[0]
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm13, %ymm14, %ymm13
+; AVX2-ONLY-NEXT:    vbroadcastsd 416(%rcx), %ymm14
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm14[6,7]
+; AVX2-ONLY-NEXT:    vbroadcastsd 440(%rcx), %ymm14
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} xmm14 = xmm14[0,1],mem[2,3]
+; AVX2-ONLY-NEXT:    vbroadcastsd 440(%r9), %ymm12
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm12 = ymm14[0,1,2,3],ymm12[4,5,6,7]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm2[0],mem[0],ymm2[2],mem[2]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],ymm10[2,3]
+; AVX2-ONLY-NEXT:    vbroadcastsd 464(%r9), %ymm10
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm10[6,7]
+; AVX2-ONLY-NEXT:    vbroadcastsd 472(%rcx), %ymm10
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} xmm10 = xmm10[0,1],mem[2,3]
+; AVX2-ONLY-NEXT:    vbroadcastsd 472(%r9), %ymm14
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm14[4,5,6,7]
+; AVX2-ONLY-NEXT:    vmovaps 448(%rax), %ymm14
+; AVX2-ONLY-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm1 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm1 = mem[0,1],ymm14[2,3],mem[4,5,6,7]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm14[6,7]
+; AVX2-ONLY-NEXT:    vmovaps 480(%rdi), %xmm14
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm14 = xmm14[0],mem[0]
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm8, %ymm14, %ymm8
+; AVX2-ONLY-NEXT:    vbroadcastsd 480(%rcx), %ymm14
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm14[6,7]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm6 = ymm6[0],mem[0],ymm6[2],mem[2]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm6 = ymm6[2,3],ymm11[2,3]
+; AVX2-ONLY-NEXT:    vbroadcastsd 496(%r9), %ymm11
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm11[6,7]
+; AVX2-ONLY-NEXT:    vbroadcastsd 504(%rcx), %ymm11
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} xmm11 = xmm11[0,1],mem[2,3]
+; AVX2-ONLY-NEXT:    vbroadcastsd 504(%r9), %ymm14
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm14[4,5,6,7]
+; AVX2-ONLY-NEXT:    vmovaps 480(%rax), %ymm14
+; AVX2-ONLY-NEXT:    vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm0 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT:    # ymm0 = mem[0,1],ymm14[2,3],mem[4,5,6,7]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm14[6,7]
+; AVX2-ONLY-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],mem[6,7]
+; AVX2-ONLY-NEXT:    vmovaps %ymm11, 3552(%rcx)
+; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
+; AVX2-ONLY-NEXT:    vmovaps %ymm11, 3520(%rcx)
+; AVX2-ONLY-NEXT:    vmovaps %ymm6, 3488(%rcx)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 3456(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 3296(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 3424(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 3264(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 3392(%rcx)
+; AVX2-ONLY-NEXT:    vmovaps %ymm8, 3360(%rcx)
+; AVX2-ONLY-NEXT:    vmovaps %ymm10, 3328(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 3232(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 3296(%rcx)
+; AVX2-ONLY-NEXT:    vmovaps %ymm2, 3264(%rcx)
+; AVX2-ONLY-NEXT:    vmovaps %ymm1, 3232(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 3200(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 3200(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 3168(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 3168(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 3104(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 3136(%rcx)
+; AVX2-ONLY-NEXT:    vmovaps %ymm12, 3104(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 3072(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 3072(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 3040(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 3040(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 3008(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 3008(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 2976(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 2976(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 2944(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 2944(%rcx)
+; AVX2-ONLY-NEXT:    vmovaps %ymm13, 2912(%rcx)
+; AVX2-ONLY-NEXT:    vmovaps %ymm15, 2880(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 2912(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 2848(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 2880(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 2816(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 2848(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 2784(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 2816(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 2752(%rcx)
+; AVX2-ONLY-NEXT:    vmovaps %ymm3, 2720(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 2784(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 2688(%rcx)
+; AVX2-ONLY-NEXT:    vmovaps %ymm4, 2656(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 2752(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 2624(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 2720(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 2592(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 2656(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 2560(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 2624(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 2528(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 2592(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 2496(%rcx)
+; AVX2-ONLY-NEXT:    vmovaps %ymm5, 2464(%rcx)
+; AVX2-ONLY-NEXT:    vmovaps %ymm7, 2432(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 2560(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 2400(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 2528(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 2368(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 2496(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 2336(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 2464(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 2304(%rcx)
+; AVX2-ONLY-NEXT:    vmovaps %ymm9, 2272(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 2432(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 2240(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 2400(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 2208(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 2368(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 2176(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 2336(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 2144(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 2304(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 2112(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 2272(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 2080(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 2208(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 2048(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 2176(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 2016(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 2144(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1984(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 2112(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1952(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 2080(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1920(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 2048(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1888(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 2016(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1856(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1984(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1824(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1952(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1792(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1920(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1760(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1888(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1728(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1856(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1696(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1824(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1664(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1760(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1632(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1728(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1600(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1696(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1568(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1664(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1536(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1632(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1504(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1600(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1472(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1568(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1440(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1536(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1408(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1504(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1376(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1472(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1344(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1440(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1312(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1408(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1280(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1376(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1248(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1312(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1216(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1280(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1184(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1248(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1152(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1216(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1120(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1184(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1088(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1152(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1056(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1120(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1024(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1088(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 992(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1056(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 960(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 1024(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 928(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 992(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 896(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 960(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 864(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 928(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 832(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 864(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 800(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 832(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 768(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 800(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 736(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 768(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 704(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 736(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 672(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 704(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 640(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 672(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 608(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 640(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 576(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 608(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 544(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 576(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 512(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 544(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 480(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 512(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 448(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 480(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 416(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 416(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 384(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 384(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 352(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 352(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 320(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 320(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 288(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 288(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 256(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 256(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 224(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 224(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 192(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 192(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 160(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 160(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 128(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 128(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 96(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 96(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 64(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 64(%r8)
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 32(%rcx)
 ; AVX2-ONLY-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 32(%r8)
-; AVX2-ONLY-NEXT:    addq $3624, %rsp # imm = 0xE28
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, (%rcx)
+; AVX2-ONLY-NEXT:    addq $3896, %rsp # imm = 0xF38
 ; AVX2-ONLY-NEXT:    vzeroupper
 ; AVX2-ONLY-NEXT:    retq
 ;

diff  --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-8.ll
index 083c206fe9356e..efb3a98dc68d71 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-8.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-8.ll
@@ -273,54 +273,46 @@ define void @store_i64_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-ONLY-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX2-ONLY-NEXT:    movq {{[0-9]+}}(%rsp), %r10
 ; AVX2-ONLY-NEXT:    movq {{[0-9]+}}(%rsp), %r11
-; AVX2-ONLY-NEXT:    vmovaps (%rdi), %ymm2
-; AVX2-ONLY-NEXT:    vmovaps (%rsi), %ymm3
-; AVX2-ONLY-NEXT:    vmovaps (%rdx), %ymm4
-; AVX2-ONLY-NEXT:    vmovaps (%rcx), %ymm5
-; AVX2-ONLY-NEXT:    vmovaps (%r8), %ymm1
-; AVX2-ONLY-NEXT:    vmovaps (%r9), %ymm6
-; AVX2-ONLY-NEXT:    vmovaps (%r11), %ymm7
-; AVX2-ONLY-NEXT:    vmovaps (%r10), %ymm8
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm7[0],ymm8[0],ymm7[2],ymm8[2]
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm9 = ymm1[0],ymm6[0],ymm1[2],ymm6[2]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm9[2,3],ymm0[2,3]
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm7 = ymm7[1],ymm8[1],ymm7[3],ymm8[3]
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm6[1],ymm1[3],ymm6[3]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm7[2,3]
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm6 = ymm4[1],ymm5[1],ymm4[3],ymm5[3]
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm7 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm6 = ymm7[2,3],ymm6[2,3]
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm4 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
+; AVX2-ONLY-NEXT:    vmovaps (%rdi), %ymm0
+; AVX2-ONLY-NEXT:    vmovaps (%rsi), %ymm1
+; AVX2-ONLY-NEXT:    vmovaps (%rdx), %ymm2
+; AVX2-ONLY-NEXT:    vmovaps (%rcx), %ymm3
+; AVX2-ONLY-NEXT:    vmovaps (%r8), %ymm4
+; AVX2-ONLY-NEXT:    vmovaps (%r9), %ymm5
+; AVX2-ONLY-NEXT:    vmovaps (%r11), %ymm6
+; AVX2-ONLY-NEXT:    vmovaps (%r10), %ymm7
+; AVX2-ONLY-NEXT:    vmovaps (%r9), %xmm8
+; AVX2-ONLY-NEXT:    vinsertf128 $1, (%r10), %ymm8, %ymm8
+; AVX2-ONLY-NEXT:    vmovaps (%r8), %xmm9
+; AVX2-ONLY-NEXT:    vinsertf128 $1, (%r11), %ymm9, %ymm9
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm10 = ymm9[0],ymm8[0],ymm9[2],ymm8[2]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm8 = ymm9[1],ymm8[1],ymm9[3],ymm8[3]
+; AVX2-ONLY-NEXT:    vmovaps (%rsi), %xmm9
+; AVX2-ONLY-NEXT:    vinsertf128 $1, (%rcx), %ymm9, %ymm9
+; AVX2-ONLY-NEXT:    vmovaps (%rdi), %xmm11
+; AVX2-ONLY-NEXT:    vinsertf128 $1, (%rdx), %ymm11, %ymm11
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm12 = ymm11[0],ymm9[0],ymm11[2],ymm9[2]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm9 = ymm11[1],ymm9[1],ymm11[3],ymm9[3]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm11 = ymm6[0],ymm7[0],ymm6[2],ymm7[2]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm13 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm11 = ymm13[2,3],ymm11[2,3]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm6 = ymm6[1],ymm7[1],ymm6[3],ymm7[3]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],ymm6[2,3]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm5 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm6 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm5 = ymm6[2,3],ymm5[2,3]
 ; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
-; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],ymm4[2,3]
-; AVX2-ONLY-NEXT:    vmovaps (%r9), %xmm3
-; AVX2-ONLY-NEXT:    vmovaps (%r8), %xmm4
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm5 = xmm4[1],xmm3[1]
-; AVX2-ONLY-NEXT:    vmovaps (%r10), %xmm7
-; AVX2-ONLY-NEXT:    vmovaps (%r11), %xmm8
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm9 = xmm8[1],xmm7[1]
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm3 = xmm4[0],xmm3[0]
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm4 = xmm8[0],xmm7[0]
-; AVX2-ONLY-NEXT:    vmovaps (%rsi), %xmm7
-; AVX2-ONLY-NEXT:    vmovaps (%rdi), %xmm8
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm10 = xmm8[1],xmm7[1]
-; AVX2-ONLY-NEXT:    vmovaps (%rcx), %xmm11
-; AVX2-ONLY-NEXT:    vmovaps (%rdx), %xmm12
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm13 = xmm12[1],xmm11[1]
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm7 = xmm8[0],xmm7[0]
-; AVX2-ONLY-NEXT:    vmovlhps {{.*#+}} xmm8 = xmm12[0],xmm11[0]
-; AVX2-ONLY-NEXT:    vmovaps %xmm8, 16(%rax)
-; AVX2-ONLY-NEXT:    vmovaps %xmm7, (%rax)
-; AVX2-ONLY-NEXT:    vmovaps %xmm13, 80(%rax)
-; AVX2-ONLY-NEXT:    vmovaps %xmm10, 64(%rax)
-; AVX2-ONLY-NEXT:    vmovaps %xmm4, 48(%rax)
-; AVX2-ONLY-NEXT:    vmovaps %xmm3, 32(%rax)
-; AVX2-ONLY-NEXT:    vmovaps %xmm9, 112(%rax)
-; AVX2-ONLY-NEXT:    vmovaps %xmm5, 96(%rax)
-; AVX2-ONLY-NEXT:    vmovaps %ymm2, 128(%rax)
-; AVX2-ONLY-NEXT:    vmovaps %ymm6, 192(%rax)
-; AVX2-ONLY-NEXT:    vmovaps %ymm1, 224(%rax)
-; AVX2-ONLY-NEXT:    vmovaps %ymm0, 160(%rax)
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
+; AVX2-ONLY-NEXT:    vmovaps %ymm0, 128(%rax)
+; AVX2-ONLY-NEXT:    vmovaps %ymm5, 192(%rax)
+; AVX2-ONLY-NEXT:    vmovaps %ymm4, 224(%rax)
+; AVX2-ONLY-NEXT:    vmovaps %ymm11, 160(%rax)
+; AVX2-ONLY-NEXT:    vmovaps %ymm9, 64(%rax)
+; AVX2-ONLY-NEXT:    vmovaps %ymm12, (%rax)
+; AVX2-ONLY-NEXT:    vmovaps %ymm8, 96(%rax)
+; AVX2-ONLY-NEXT:    vmovaps %ymm10, 32(%rax)
 ; AVX2-ONLY-NEXT:    vzeroupper
 ; AVX2-ONLY-NEXT:    retq
 ;
@@ -807,69 +799,65 @@ define void @store_i64_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512F-NEXT:    movb $-64, %r8b
 ; AVX512F-NEXT:    kmovw %r8d, %k1
 ; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm4 {%k1}
-; AVX512F-NEXT:    vmovdqa (%rcx), %xmm10
-; AVX512F-NEXT:    vmovdqa (%rdx), %xmm11
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} xmm5 = xmm11[1],xmm10[1]
-; AVX512F-NEXT:    vmovdqa (%rsi), %xmm12
-; AVX512F-NEXT:    vmovdqa (%rdi), %xmm14
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} xmm13 = xmm14[1],xmm12[1]
-; AVX512F-NEXT:    vinserti128 $1, %xmm5, %ymm13, %ymm5
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm5, %zmm4, %zmm16
+; AVX512F-NEXT:    vmovdqa (%rsi), %xmm5
+; AVX512F-NEXT:    vinserti128 $1, (%rcx), %ymm5, %ymm10
+; AVX512F-NEXT:    vmovdqa (%rdi), %xmm5
+; AVX512F-NEXT:    vinserti128 $1, (%rdx), %ymm5, %ymm12
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm5 = ymm12[1],ymm10[1],ymm12[3],ymm10[3]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm5, %zmm4, %zmm17
 ; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm5 = [6,14,6,14,6,14,6,14]
 ; AVX512F-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm13
-; AVX512F-NEXT:    vpermt2q %zmm2, %zmm5, %zmm13
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm13 {%k1} = zmm1[0],zmm3[0],zmm1[2],zmm3[2],zmm1[4],zmm3[4],zmm1[6],zmm3[6]
+; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm11
+; AVX512F-NEXT:    vpermt2q %zmm2, %zmm5, %zmm11
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm11 {%k1} = zmm1[0],zmm3[0],zmm1[2],zmm3[2],zmm1[4],zmm3[4],zmm1[6],zmm3[6]
 ; AVX512F-NEXT:    vpermi2q %zmm9, %zmm6, %zmm5
-; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm15 = [6,14,6,14]
-; AVX512F-NEXT:    # ymm15 = mem[0,1,0,1]
-; AVX512F-NEXT:    vpermi2q %zmm8, %zmm7, %zmm15
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm15[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm5, %zmm13, %zmm5
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm13 = [7,15,7,15,7,15,7,15]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm13 = [6,14,6,14]
+; AVX512F-NEXT:    # ymm13 = mem[0,1,0,1]
+; AVX512F-NEXT:    vpermi2q %zmm8, %zmm7, %zmm13
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm13[4,5,6,7]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm5, %zmm11, %zmm5
+; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm11 = [7,15,7,15,7,15,7,15]
+; AVX512F-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm13
+; AVX512F-NEXT:    vpermt2q %zmm2, %zmm11, %zmm13
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm13 {%k1} = zmm1[1],zmm3[1],zmm1[3],zmm3[3],zmm1[5],zmm3[5],zmm1[7],zmm3[7]
+; AVX512F-NEXT:    vpermi2q %zmm9, %zmm6, %zmm11
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm14 = [7,15,7,15]
+; AVX512F-NEXT:    # ymm14 = mem[0,1,0,1]
+; AVX512F-NEXT:    vpermi2q %zmm8, %zmm7, %zmm14
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm14[4,5,6,7]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm11, %zmm13, %zmm11
+; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm13 = [4,12,4,12,4,12,4,12]
 ; AVX512F-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm15
-; AVX512F-NEXT:    vpermt2q %zmm2, %zmm13, %zmm15
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm1[1],zmm3[1],zmm1[3],zmm3[3],zmm1[5],zmm3[5],zmm1[7],zmm3[7]
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm14
+; AVX512F-NEXT:    vpermt2q %zmm3, %zmm13, %zmm14
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm15 = zmm0[0],zmm2[0],zmm0[2],zmm2[2],zmm0[4],zmm2[4],zmm0[6],zmm2[6]
+; AVX512F-NEXT:    vmovdqa64 %zmm14, %zmm15 {%k1}
 ; AVX512F-NEXT:    vpermi2q %zmm9, %zmm6, %zmm13
-; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [7,15,7,15]
-; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
-; AVX512F-NEXT:    vpermi2q %zmm8, %zmm7, %zmm4
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm4[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm4, %zmm15, %zmm13
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm4 = [4,12,4,12,4,12,4,12]
-; AVX512F-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm14 = [4,12,4,12]
+; AVX512F-NEXT:    # ymm14 = mem[0,1,0,1]
+; AVX512F-NEXT:    vpermi2q %zmm8, %zmm7, %zmm14
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm13, %zmm15, %zmm13
+; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm14 = [5,13,5,13,5,13,5,13]
+; AVX512F-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm15
-; AVX512F-NEXT:    vpermt2q %zmm3, %zmm4, %zmm15
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm17 = zmm0[0],zmm2[0],zmm0[2],zmm2[2],zmm0[4],zmm2[4],zmm0[6],zmm2[6]
-; AVX512F-NEXT:    vmovdqa64 %zmm15, %zmm17 {%k1}
-; AVX512F-NEXT:    vpermi2q %zmm9, %zmm6, %zmm4
-; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm15 = [4,12,4,12]
-; AVX512F-NEXT:    # ymm15 = mem[0,1,0,1]
-; AVX512F-NEXT:    vpermi2q %zmm8, %zmm7, %zmm15
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm15[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm4, %zmm17, %zmm4
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm15 = [5,13,5,13,5,13,5,13]
-; AVX512F-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm17
-; AVX512F-NEXT:    vpermt2q %zmm3, %zmm15, %zmm17
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm18 = zmm0[1],zmm2[1],zmm0[3],zmm2[3],zmm0[5],zmm2[5],zmm0[7],zmm2[7]
-; AVX512F-NEXT:    vmovdqa64 %zmm17, %zmm18 {%k1}
-; AVX512F-NEXT:    vpermt2q %zmm9, %zmm15, %zmm6
+; AVX512F-NEXT:    vpermt2q %zmm3, %zmm14, %zmm15
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm16 = zmm0[1],zmm2[1],zmm0[3],zmm2[3],zmm0[5],zmm2[5],zmm0[7],zmm2[7]
+; AVX512F-NEXT:    vmovdqa64 %zmm15, %zmm16 {%k1}
+; AVX512F-NEXT:    vpermt2q %zmm9, %zmm14, %zmm6
 ; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm9 = [5,13,5,13]
 ; AVX512F-NEXT:    # ymm9 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpermi2q %zmm8, %zmm7, %zmm9
 ; AVX512F-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm9[4,5,6,7]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm6, %zmm18, %zmm6
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm6, %zmm16, %zmm6
 ; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm7 = [0,8,0,8,0,8,0,8]
 ; AVX512F-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm8
 ; AVX512F-NEXT:    vpermt2q %zmm3, %zmm7, %zmm8
 ; AVX512F-NEXT:    vpermi2q %zmm2, %zmm0, %zmm7
 ; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm7 {%k1}
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm8 = xmm11[0],xmm10[0]
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm9 = xmm14[0],xmm12[0]
-; AVX512F-NEXT:    vinserti128 $1, %xmm8, %ymm9, %ymm8
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm8 = ymm12[0],ymm10[0],ymm12[2],ymm10[2]
 ; AVX512F-NEXT:    vinserti64x4 $0, %ymm8, %zmm7, %zmm7
 ; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm8 = [2,10,2,10,2,10,2,10]
 ; AVX512F-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
@@ -879,29 +867,29 @@ define void @store_i64_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512F-NEXT:    vmovdqa64 %zmm9, %zmm8 {%k1}
 ; AVX512F-NEXT:    vmovdqa (%rcx), %ymm9
 ; AVX512F-NEXT:    vmovdqa (%rdx), %ymm10
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm11 = ymm10[0],ymm9[0],ymm10[2],ymm9[2]
-; AVX512F-NEXT:    vmovdqa (%rsi), %ymm12
-; AVX512F-NEXT:    vmovdqa (%rdi), %ymm14
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm15 = ymm14[0],ymm12[0],ymm14[2],ymm12[2]
-; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm11 = ymm15[2,3],ymm11[2,3]
-; AVX512F-NEXT:    vinserti64x4 $0, %ymm11, %zmm8, %zmm8
-; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm11 = [3,11,3,11,3,11,3,11]
-; AVX512F-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT:    vpermt2q %zmm3, %zmm11, %zmm1
-; AVX512F-NEXT:    vpermt2q %zmm2, %zmm11, %zmm0
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm12 = ymm10[0],ymm9[0],ymm10[2],ymm9[2]
+; AVX512F-NEXT:    vmovdqa (%rsi), %ymm14
+; AVX512F-NEXT:    vmovdqa (%rdi), %ymm15
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm4 = ymm15[0],ymm14[0],ymm15[2],ymm14[2]
+; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm4 = ymm4[2,3],ymm12[2,3]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm4, %zmm8, %zmm4
+; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm8 = [3,11,3,11,3,11,3,11]
+; AVX512F-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vpermt2q %zmm3, %zmm8, %zmm1
+; AVX512F-NEXT:    vpermt2q %zmm2, %zmm8, %zmm0
 ; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1}
 ; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm10[1],ymm9[1],ymm10[3],ymm9[3]
-; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm14[1],ymm12[1],ymm14[3],ymm12[3]
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm15[1],ymm14[1],ymm15[3],ymm14[3]
 ; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
 ; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm0
 ; AVX512F-NEXT:    vmovdqa64 %zmm0, 192(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm8, 128(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm4, 128(%rax)
 ; AVX512F-NEXT:    vmovdqa64 %zmm7, (%rax)
 ; AVX512F-NEXT:    vmovdqa64 %zmm6, 320(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm4, 256(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm13, 448(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm13, 256(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm11, 448(%rax)
 ; AVX512F-NEXT:    vmovdqa64 %zmm5, 384(%rax)
-; AVX512F-NEXT:    vmovdqa64 %zmm16, 64(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm17, 64(%rax)
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
@@ -926,69 +914,65 @@ define void @store_i64_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512BW-NEXT:    movb $-64, %r8b
 ; AVX512BW-NEXT:    kmovd %r8d, %k1
 ; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm4 {%k1}
-; AVX512BW-NEXT:    vmovdqa (%rcx), %xmm10
-; AVX512BW-NEXT:    vmovdqa (%rdx), %xmm11
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} xmm5 = xmm11[1],xmm10[1]
-; AVX512BW-NEXT:    vmovdqa (%rsi), %xmm12
-; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm14
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} xmm13 = xmm14[1],xmm12[1]
-; AVX512BW-NEXT:    vinserti128 $1, %xmm5, %ymm13, %ymm5
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm5, %zmm4, %zmm16
+; AVX512BW-NEXT:    vmovdqa (%rsi), %xmm5
+; AVX512BW-NEXT:    vinserti128 $1, (%rcx), %ymm5, %ymm10
+; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm5
+; AVX512BW-NEXT:    vinserti128 $1, (%rdx), %ymm5, %ymm12
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm5 = ymm12[1],ymm10[1],ymm12[3],ymm10[3]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm5, %zmm4, %zmm17
 ; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm5 = [6,14,6,14,6,14,6,14]
 ; AVX512BW-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm13
-; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm5, %zmm13
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm13 {%k1} = zmm1[0],zmm3[0],zmm1[2],zmm3[2],zmm1[4],zmm3[4],zmm1[6],zmm3[6]
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm11
+; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm5, %zmm11
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm11 {%k1} = zmm1[0],zmm3[0],zmm1[2],zmm3[2],zmm1[4],zmm3[4],zmm1[6],zmm3[6]
 ; AVX512BW-NEXT:    vpermi2q %zmm9, %zmm6, %zmm5
-; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm15 = [6,14,6,14]
-; AVX512BW-NEXT:    # ymm15 = mem[0,1,0,1]
-; AVX512BW-NEXT:    vpermi2q %zmm8, %zmm7, %zmm15
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm15[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm5, %zmm13, %zmm5
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm13 = [7,15,7,15,7,15,7,15]
+; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm13 = [6,14,6,14]
+; AVX512BW-NEXT:    # ymm13 = mem[0,1,0,1]
+; AVX512BW-NEXT:    vpermi2q %zmm8, %zmm7, %zmm13
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm13[4,5,6,7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm5, %zmm11, %zmm5
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm11 = [7,15,7,15,7,15,7,15]
+; AVX512BW-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm13
+; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm11, %zmm13
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm13 {%k1} = zmm1[1],zmm3[1],zmm1[3],zmm3[3],zmm1[5],zmm3[5],zmm1[7],zmm3[7]
+; AVX512BW-NEXT:    vpermi2q %zmm9, %zmm6, %zmm11
+; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm14 = [7,15,7,15]
+; AVX512BW-NEXT:    # ymm14 = mem[0,1,0,1]
+; AVX512BW-NEXT:    vpermi2q %zmm8, %zmm7, %zmm14
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm14[4,5,6,7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm11, %zmm13, %zmm11
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm13 = [4,12,4,12,4,12,4,12]
 ; AVX512BW-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm15
-; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm13, %zmm15
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm1[1],zmm3[1],zmm1[3],zmm3[3],zmm1[5],zmm3[5],zmm1[7],zmm3[7]
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm14
+; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm13, %zmm14
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm15 = zmm0[0],zmm2[0],zmm0[2],zmm2[2],zmm0[4],zmm2[4],zmm0[6],zmm2[6]
+; AVX512BW-NEXT:    vmovdqa64 %zmm14, %zmm15 {%k1}
 ; AVX512BW-NEXT:    vpermi2q %zmm9, %zmm6, %zmm13
-; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [7,15,7,15]
-; AVX512BW-NEXT:    # ymm4 = mem[0,1,0,1]
-; AVX512BW-NEXT:    vpermi2q %zmm8, %zmm7, %zmm4
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm4[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm4, %zmm15, %zmm13
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm4 = [4,12,4,12,4,12,4,12]
-; AVX512BW-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm14 = [4,12,4,12]
+; AVX512BW-NEXT:    # ymm14 = mem[0,1,0,1]
+; AVX512BW-NEXT:    vpermi2q %zmm8, %zmm7, %zmm14
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm13, %zmm15, %zmm13
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm14 = [5,13,5,13,5,13,5,13]
+; AVX512BW-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm15
-; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm4, %zmm15
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm17 = zmm0[0],zmm2[0],zmm0[2],zmm2[2],zmm0[4],zmm2[4],zmm0[6],zmm2[6]
-; AVX512BW-NEXT:    vmovdqa64 %zmm15, %zmm17 {%k1}
-; AVX512BW-NEXT:    vpermi2q %zmm9, %zmm6, %zmm4
-; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm15 = [4,12,4,12]
-; AVX512BW-NEXT:    # ymm15 = mem[0,1,0,1]
-; AVX512BW-NEXT:    vpermi2q %zmm8, %zmm7, %zmm15
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm15[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm4, %zmm17, %zmm4
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm15 = [5,13,5,13,5,13,5,13]
-; AVX512BW-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm17
-; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm15, %zmm17
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm18 = zmm0[1],zmm2[1],zmm0[3],zmm2[3],zmm0[5],zmm2[5],zmm0[7],zmm2[7]
-; AVX512BW-NEXT:    vmovdqa64 %zmm17, %zmm18 {%k1}
-; AVX512BW-NEXT:    vpermt2q %zmm9, %zmm15, %zmm6
+; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm14, %zmm15
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm16 = zmm0[1],zmm2[1],zmm0[3],zmm2[3],zmm0[5],zmm2[5],zmm0[7],zmm2[7]
+; AVX512BW-NEXT:    vmovdqa64 %zmm15, %zmm16 {%k1}
+; AVX512BW-NEXT:    vpermt2q %zmm9, %zmm14, %zmm6
 ; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm9 = [5,13,5,13]
 ; AVX512BW-NEXT:    # ymm9 = mem[0,1,0,1]
 ; AVX512BW-NEXT:    vpermi2q %zmm8, %zmm7, %zmm9
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm9[4,5,6,7]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm6, %zmm18, %zmm6
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm6, %zmm16, %zmm6
 ; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm7 = [0,8,0,8,0,8,0,8]
 ; AVX512BW-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm8
 ; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm7, %zmm8
 ; AVX512BW-NEXT:    vpermi2q %zmm2, %zmm0, %zmm7
 ; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm7 {%k1}
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm8 = xmm11[0],xmm10[0]
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm9 = xmm14[0],xmm12[0]
-; AVX512BW-NEXT:    vinserti128 $1, %xmm8, %ymm9, %ymm8
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm8 = ymm12[0],ymm10[0],ymm12[2],ymm10[2]
 ; AVX512BW-NEXT:    vinserti64x4 $0, %ymm8, %zmm7, %zmm7
 ; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm8 = [2,10,2,10,2,10,2,10]
 ; AVX512BW-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
@@ -998,29 +982,29 @@ define void @store_i64_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm8 {%k1}
 ; AVX512BW-NEXT:    vmovdqa (%rcx), %ymm9
 ; AVX512BW-NEXT:    vmovdqa (%rdx), %ymm10
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm11 = ymm10[0],ymm9[0],ymm10[2],ymm9[2]
-; AVX512BW-NEXT:    vmovdqa (%rsi), %ymm12
-; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm14
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm15 = ymm14[0],ymm12[0],ymm14[2],ymm12[2]
-; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm11 = ymm15[2,3],ymm11[2,3]
-; AVX512BW-NEXT:    vinserti64x4 $0, %ymm11, %zmm8, %zmm8
-; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm11 = [3,11,3,11,3,11,3,11]
-; AVX512BW-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm11, %zmm1
-; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm11, %zmm0
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm12 = ymm10[0],ymm9[0],ymm10[2],ymm9[2]
+; AVX512BW-NEXT:    vmovdqa (%rsi), %ymm14
+; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm15
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm4 = ymm15[0],ymm14[0],ymm15[2],ymm14[2]
+; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm4 = ymm4[2,3],ymm12[2,3]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm4, %zmm8, %zmm4
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm8 = [3,11,3,11,3,11,3,11]
+; AVX512BW-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vpermt2q %zmm3, %zmm8, %zmm1
+; AVX512BW-NEXT:    vpermt2q %zmm2, %zmm8, %zmm0
 ; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1}
 ; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm10[1],ymm9[1],ymm10[3],ymm9[3]
-; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm14[1],ymm12[1],ymm14[3],ymm12[3]
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm15[1],ymm14[1],ymm15[3],ymm14[3]
 ; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
 ; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vmovdqa64 %zmm0, 192(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm8, 128(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm4, 128(%rax)
 ; AVX512BW-NEXT:    vmovdqa64 %zmm7, (%rax)
 ; AVX512BW-NEXT:    vmovdqa64 %zmm6, 320(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm4, 256(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm13, 448(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm13, 256(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm11, 448(%rax)
 ; AVX512BW-NEXT:    vmovdqa64 %zmm5, 384(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm16, 64(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm17, 64(%rax)
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
   %in.vec0 = load <8 x i64>, ptr %in.vecptr0, align 64
@@ -1904,1709 +1888,419 @@ define void @store_i64_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-ONLY-NEXT:    vzeroupper
 ; AVX2-ONLY-NEXT:    retq
 ;
-; AVX512F-ONLY-SLOW-LABEL: store_i64_stride8_vf16:
-; AVX512F-ONLY-SLOW:       # %bb.0:
-; AVX512F-ONLY-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512F-ONLY-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm7
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%rsi), %zmm19
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%rdx), %zmm9
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%rcx), %zmm18
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%r8), %zmm5
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 64(%r8), %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%r9), %zmm30
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 64(%r9), %zmm27
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%r10), %zmm8
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 64(%r10), %zmm1
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%rax), %zmm31
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 64(%rax), %zmm29
-; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm14 = [1,9,1,9,1,9,1,9]
-; AVX512F-ONLY-SLOW-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm31, %zmm14, %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm3
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm30, %zmm14, %zmm3
-; AVX512F-ONLY-SLOW-NEXT:    movb $-64, %al
-; AVX512F-ONLY-SLOW-NEXT:    kmovw %eax, %k1
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa (%rcx), %xmm2
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 64(%rcx), %xmm16
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa (%rdx), %xmm6
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 64(%rdx), %xmm17
-; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm10 = xmm6[1],xmm2[1]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa (%rsi), %xmm11
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa (%rdi), %xmm12
-; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm20 = xmm12[1],xmm11[1]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti32x4 $1, %xmm10, %ymm20, %ymm10
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm10, %zmm3, %zmm21
-; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm20 = [0,8,0,8,0,8,0,8]
-; AVX512F-ONLY-SLOW-NEXT:    # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm3
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm31, %zmm20, %zmm3
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm10
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm30, %zmm20, %zmm10
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm10 {%k1}
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm6[0],xmm2[0]
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm12[0],xmm11[0]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm2, %ymm3, %ymm2
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm10, %zmm22
-; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [5,13,5,13,5,13,5,13]
-; AVX512F-ONLY-SLOW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm31, %zmm3, %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm6 = zmm5[1],zmm30[1],zmm5[3],zmm30[3],zmm5[5],zmm30[5],zmm5[7],zmm30[7]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm6 {%k1}
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm19, %zmm3, %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm11 = [5,13,5,13]
-; AVX512F-ONLY-SLOW-NEXT:    # ymm11 = mem[0,1,0,1]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm10
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm18, %zmm11, %zmm10
-; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm6, %zmm23
-; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm6 = [4,12,4,12,4,12,4,12]
-; AVX512F-ONLY-SLOW-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm31, %zmm6, %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm10 = zmm5[0],zmm30[0],zmm5[2],zmm30[2],zmm5[4],zmm30[4],zmm5[6],zmm30[6]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm10 {%k1}
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm19, %zmm6, %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm12 = [4,12,4,12]
-; AVX512F-ONLY-SLOW-NEXT:    # ymm12 = mem[0,1,0,1]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm15
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm18, %zmm12, %zmm15
-; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm15[4,5,6,7]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm10, %zmm24
-; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm10 = [7,15,7,15,7,15,7,15]
-; AVX512F-ONLY-SLOW-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm15
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm30, %zmm10, %zmm15
-; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm8[1],zmm31[1],zmm8[3],zmm31[3],zmm8[5],zmm31[5],zmm8[7],zmm31[7]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm13
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm19, %zmm10, %zmm13
-; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [7,15,7,15]
-; AVX512F-ONLY-SLOW-NEXT:    # ymm2 = mem[0,1,0,1]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm4
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm18, %zmm2, %zmm4
-; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm4[4,5,6,7]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm15, %zmm25
-; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm26 = [6,14,6,14,6,14,6,14]
-; AVX512F-ONLY-SLOW-NEXT:    # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm4
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm30, %zmm26, %zmm4
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm8[0],zmm31[0],zmm8[2],zmm31[2],zmm8[4],zmm31[4],zmm8[6],zmm31[6]
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm19, %zmm26, %zmm7
-; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} ymm19 = [6,14,6,14]
-; AVX512F-ONLY-SLOW-NEXT:    # ymm19 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm18, %zmm19, %zmm9
-; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm9[4,5,6,7]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm7, %zmm4, %zmm18
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm4
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm29, %zmm14, %zmm4
-; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm27, %zmm0, %zmm14
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm14 {%k1}
-; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm4 = xmm17[1],xmm16[1]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 64(%rsi), %xmm7
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 64(%rdi), %xmm9
-; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm13 = xmm9[1],xmm7[1]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm4, %ymm13, %ymm4
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm14, %zmm28
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm4
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm29, %zmm20, %zmm4
-; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm27, %zmm0, %zmm20
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm20 {%k1}
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 64(%rdi), %zmm13
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm17[0],xmm16[0]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 64(%rsi), %zmm16
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm7 = xmm9[0],xmm7[0]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 64(%rdx), %zmm14
-; AVX512F-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm4, %ymm7, %ymm4
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm20, %zmm17
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm4
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm29, %zmm3, %zmm4
-; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm7 = zmm0[1],zmm27[1],zmm0[3],zmm27[3],zmm0[5],zmm27[5],zmm0[7],zmm27[7]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm7 {%k1}
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 64(%rcx), %zmm4
-; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm16, %zmm13, %zmm3
-; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm4, %zmm14, %zmm11
-; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm7, %zmm3
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm7
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm29, %zmm6, %zmm7
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm9 = zmm0[0],zmm27[0],zmm0[2],zmm27[2],zmm0[4],zmm27[4],zmm0[6],zmm27[6]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm9 {%k1}
-; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm16, %zmm13, %zmm6
-; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm4, %zmm14, %zmm12
-; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm12[4,5,6,7]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm6, %zmm9, %zmm6
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm7
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm27, %zmm10, %zmm7
-; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm7 {%k1} = zmm1[1],zmm29[1],zmm1[3],zmm29[3],zmm1[5],zmm29[5],zmm1[7],zmm29[7]
-; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm16, %zmm13, %zmm10
-; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm4, %zmm14, %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm2[4,5,6,7]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm7, %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm7
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm27, %zmm26, %zmm7
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm1[0],zmm29[0],zmm1[2],zmm29[2],zmm1[4],zmm29[4],zmm1[6],zmm29[6]
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm16, %zmm26, %zmm13
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm19, %zmm14
-; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm14[4,5,6,7]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm7, %zmm4
-; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm7 = [3,11,3,11,3,11,3,11]
-; AVX512F-ONLY-SLOW-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm9
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm31, %zmm7, %zmm9
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm10
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm30, %zmm7, %zmm10
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm10 {%k1}
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa (%rcx), %ymm9
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa (%rdx), %ymm11
-; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm12 = ymm11[1],ymm9[1],ymm11[3],ymm9[3]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa (%rsi), %ymm13
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa (%rdi), %ymm14
-; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm15 = ymm14[1],ymm13[1],ymm14[3],ymm13[3]
-; AVX512F-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm12 = ymm15[2,3],ymm12[2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm12, %zmm10, %zmm10
-; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm12 = [2,10,2,10,2,10,2,10]
-; AVX512F-ONLY-SLOW-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm31, %zmm12, %zmm8
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm30, %zmm12, %zmm5
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm5 {%k1}
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm8 = ymm11[0],ymm9[0],ymm11[2],ymm9[2]
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm9 = ymm14[0],ymm13[0],ymm14[2],ymm13[2]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 64(%rsi), %ymm11
-; AVX512F-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm8 = ymm9[2,3],ymm8[2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 64(%rdi), %ymm9
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm8, %zmm5, %zmm5
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm8
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm29, %zmm7, %zmm8
-; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm27, %zmm0, %zmm7
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm7 {%k1}
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 64(%rcx), %ymm8
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm29, %zmm12, %zmm1
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 64(%rdx), %ymm13
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm27, %zmm12, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm12 = ymm13[1],ymm8[1],ymm13[3],ymm8[3]
-; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm14 = ymm9[1],ymm11[1],ymm9[3],ymm11[3]
-; AVX512F-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm12 = ymm14[2,3],ymm12[2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm12, %zmm7, %zmm7
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1}
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm13[0],ymm8[0],ymm13[2],ymm8[2]
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm8 = ymm9[0],ymm11[0],ymm9[2],ymm11[2]
-; AVX512F-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm8[2,3],ymm1[2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, 640(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, 704(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, 128(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm10, 192(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, 896(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, 960(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, 768(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, 832(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm17, 512(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm28, 576(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm18, 384(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm25, 448(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm24, 256(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm23, 320(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm22, (%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm21, 64(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vzeroupper
-; AVX512F-ONLY-SLOW-NEXT:    retq
-;
-; AVX512F-ONLY-FAST-LABEL: store_i64_stride8_vf16:
-; AVX512F-ONLY-FAST:       # %bb.0:
-; AVX512F-ONLY-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512F-ONLY-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%rdi), %zmm7
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%rsi), %zmm19
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%rdx), %zmm9
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%rcx), %zmm18
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%r8), %zmm5
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%r8), %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%r9), %zmm30
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%r9), %zmm27
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%r10), %zmm8
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%r10), %zmm1
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%rax), %zmm31
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%rax), %zmm29
-; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm14 = [1,9,1,9,1,9,1,9]
-; AVX512F-ONLY-FAST-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm31, %zmm14, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, %zmm3
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm30, %zmm14, %zmm3
-; AVX512F-ONLY-FAST-NEXT:    movb $-64, %al
-; AVX512F-ONLY-FAST-NEXT:    kmovw %eax, %k1
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa (%rcx), %xmm2
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%rcx), %xmm16
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa (%rdx), %xmm6
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%rdx), %xmm17
-; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm10 = xmm6[1],xmm2[1]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa (%rsi), %xmm11
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa (%rdi), %xmm12
-; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm20 = xmm12[1],xmm11[1]
-; AVX512F-ONLY-FAST-NEXT:    vinserti32x4 $1, %xmm10, %ymm20, %ymm10
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm10, %zmm3, %zmm21
-; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm20 = [0,8,0,8,0,8,0,8]
-; AVX512F-ONLY-FAST-NEXT:    # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, %zmm3
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm31, %zmm20, %zmm3
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, %zmm10
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm30, %zmm20, %zmm10
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm10 {%k1}
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm6[0],xmm2[0]
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm12[0],xmm11[0]
-; AVX512F-ONLY-FAST-NEXT:    vinserti128 $1, %xmm2, %ymm3, %ymm2
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm10, %zmm22
-; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [5,13,5,13,5,13,5,13]
-; AVX512F-ONLY-FAST-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm31, %zmm3, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm6 = zmm5[1],zmm30[1],zmm5[3],zmm30[3],zmm5[5],zmm30[5],zmm5[7],zmm30[7]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm6 {%k1}
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm19, %zmm3, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm11 = [5,13,5,13]
-; AVX512F-ONLY-FAST-NEXT:    # ymm11 = mem[0,1,0,1]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, %zmm10
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm18, %zmm11, %zmm10
-; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7]
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm6, %zmm23
-; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm6 = [4,12,4,12,4,12,4,12]
-; AVX512F-ONLY-FAST-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm31, %zmm6, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm10 = zmm5[0],zmm30[0],zmm5[2],zmm30[2],zmm5[4],zmm30[4],zmm5[6],zmm30[6]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm10 {%k1}
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm19, %zmm6, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm12 = [4,12,4,12]
-; AVX512F-ONLY-FAST-NEXT:    # ymm12 = mem[0,1,0,1]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, %zmm15
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm18, %zmm12, %zmm15
-; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm15[4,5,6,7]
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm10, %zmm24
-; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm10 = [7,15,7,15,7,15,7,15]
-; AVX512F-ONLY-FAST-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, %zmm15
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm30, %zmm10, %zmm15
-; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm8[1],zmm31[1],zmm8[3],zmm31[3],zmm8[5],zmm31[5],zmm8[7],zmm31[7]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, %zmm13
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm19, %zmm10, %zmm13
-; AVX512F-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [7,15,7,15]
-; AVX512F-ONLY-FAST-NEXT:    # ymm2 = mem[0,1,0,1]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, %zmm4
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm18, %zmm2, %zmm4
-; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm4[4,5,6,7]
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm15, %zmm25
-; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm26 = [6,14,6,14,6,14,6,14]
-; AVX512F-ONLY-FAST-NEXT:    # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, %zmm4
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm30, %zmm26, %zmm4
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm8[0],zmm31[0],zmm8[2],zmm31[2],zmm8[4],zmm31[4],zmm8[6],zmm31[6]
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm19, %zmm26, %zmm7
-; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} ymm19 = [6,14,6,14]
-; AVX512F-ONLY-FAST-NEXT:    # ymm19 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm18, %zmm19, %zmm9
-; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm9[4,5,6,7]
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm7, %zmm4, %zmm18
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm4
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm29, %zmm14, %zmm4
-; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm27, %zmm0, %zmm14
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm14 {%k1}
-; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm4 = xmm17[1],xmm16[1]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa 64(%rsi), %xmm7
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa 64(%rdi), %xmm9
-; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm13 = xmm9[1],xmm7[1]
-; AVX512F-ONLY-FAST-NEXT:    vinserti128 $1, %xmm4, %ymm13, %ymm4
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm14, %zmm28
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm4
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm29, %zmm20, %zmm4
-; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm27, %zmm0, %zmm20
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm20 {%k1}
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%rdi), %zmm13
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm17[0],xmm16[0]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%rsi), %zmm16
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm7 = xmm9[0],xmm7[0]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%rdx), %zmm14
-; AVX512F-ONLY-FAST-NEXT:    vinserti128 $1, %xmm4, %ymm7, %ymm4
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm20, %zmm17
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm4
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm29, %zmm3, %zmm4
-; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm7 = zmm0[1],zmm27[1],zmm0[3],zmm27[3],zmm0[5],zmm27[5],zmm0[7],zmm27[7]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm7 {%k1}
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%rcx), %zmm4
-; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm16, %zmm13, %zmm3
-; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm4, %zmm14, %zmm11
-; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7]
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm7, %zmm3
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm7
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm29, %zmm6, %zmm7
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm9 = zmm0[0],zmm27[0],zmm0[2],zmm27[2],zmm0[4],zmm27[4],zmm0[6],zmm27[6]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, %zmm9 {%k1}
-; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm16, %zmm13, %zmm6
-; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm4, %zmm14, %zmm12
-; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm12[4,5,6,7]
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm6, %zmm9, %zmm6
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm7
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm27, %zmm10, %zmm7
-; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm7 {%k1} = zmm1[1],zmm29[1],zmm1[3],zmm29[3],zmm1[5],zmm29[5],zmm1[7],zmm29[7]
-; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm16, %zmm13, %zmm10
-; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm4, %zmm14, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm2[4,5,6,7]
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm7, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm7
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm27, %zmm26, %zmm7
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm1[0],zmm29[0],zmm1[2],zmm29[2],zmm1[4],zmm29[4],zmm1[6],zmm29[6]
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm16, %zmm26, %zmm13
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm19, %zmm14
-; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm14[4,5,6,7]
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm7, %zmm4
-; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm7 = [3,11,3,11,3,11,3,11]
-; AVX512F-ONLY-FAST-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, %zmm9
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm31, %zmm7, %zmm9
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, %zmm10
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm30, %zmm7, %zmm10
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, %zmm10 {%k1}
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa (%rcx), %ymm9
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa (%rdx), %ymm11
-; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm12 = ymm11[1],ymm9[1],ymm11[3],ymm9[3]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa (%rsi), %ymm13
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa (%rdi), %ymm14
-; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm15 = ymm14[1],ymm13[1],ymm14[3],ymm13[3]
-; AVX512F-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm12 = ymm15[2,3],ymm12[2,3]
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm12, %zmm10, %zmm10
-; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm12 = [2,10,2,10,2,10,2,10]
-; AVX512F-ONLY-FAST-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm31, %zmm12, %zmm8
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm30, %zmm12, %zmm5
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, %zmm5 {%k1}
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm8 = ymm11[0],ymm9[0],ymm11[2],ymm9[2]
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm9 = ymm14[0],ymm13[0],ymm14[2],ymm13[2]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa 64(%rsi), %ymm11
-; AVX512F-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm8 = ymm9[2,3],ymm8[2,3]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa 64(%rdi), %ymm9
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm8, %zmm5, %zmm5
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm8
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm29, %zmm7, %zmm8
-; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm27, %zmm0, %zmm7
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, %zmm7 {%k1}
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa 64(%rcx), %ymm8
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm29, %zmm12, %zmm1
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa 64(%rdx), %ymm13
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm27, %zmm12, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm12 = ymm13[1],ymm8[1],ymm13[3],ymm8[3]
-; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm14 = ymm9[1],ymm11[1],ymm9[3],ymm11[3]
-; AVX512F-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm12 = ymm14[2,3],ymm12[2,3]
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm12, %zmm7, %zmm7
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1}
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm13[0],ymm8[0],ymm13[2],ymm8[2]
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm8 = ymm9[0],ymm11[0],ymm9[2],ymm11[2]
-; AVX512F-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm8[2,3],ymm1[2,3]
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, 640(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, 704(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, 128(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm10, 192(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, 896(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, 960(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm6, 768(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, 832(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm17, 512(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm28, 576(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm18, 384(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm25, 448(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm24, 256(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm23, 320(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm22, (%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm21, 64(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vzeroupper
-; AVX512F-ONLY-FAST-NEXT:    retq
-;
-; AVX512DQ-SLOW-LABEL: store_i64_stride8_vf16:
-; AVX512DQ-SLOW:       # %bb.0:
-; AVX512DQ-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm7
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%rsi), %zmm19
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%rdx), %zmm9
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%rcx), %zmm18
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%r8), %zmm5
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 64(%r8), %zmm0
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%r9), %zmm30
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 64(%r9), %zmm27
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%r10), %zmm8
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 64(%r10), %zmm1
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%rax), %zmm31
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 64(%rax), %zmm29
-; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm14 = [1,9,1,9,1,9,1,9]
-; AVX512DQ-SLOW-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm2
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm31, %zmm14, %zmm2
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm3
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm30, %zmm14, %zmm3
-; AVX512DQ-SLOW-NEXT:    movb $-64, %al
-; AVX512DQ-SLOW-NEXT:    kmovw %eax, %k1
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
-; AVX512DQ-SLOW-NEXT:    vmovdqa (%rcx), %xmm2
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 64(%rcx), %xmm16
-; AVX512DQ-SLOW-NEXT:    vmovdqa (%rdx), %xmm6
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 64(%rdx), %xmm17
-; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm10 = xmm6[1],xmm2[1]
-; AVX512DQ-SLOW-NEXT:    vmovdqa (%rsi), %xmm11
-; AVX512DQ-SLOW-NEXT:    vmovdqa (%rdi), %xmm12
-; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm20 = xmm12[1],xmm11[1]
-; AVX512DQ-SLOW-NEXT:    vinserti32x4 $1, %xmm10, %ymm20, %ymm10
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm10, %zmm3, %zmm21
-; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm20 = [0,8,0,8,0,8,0,8]
-; AVX512DQ-SLOW-NEXT:    # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm3
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm31, %zmm20, %zmm3
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm10
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm30, %zmm20, %zmm10
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm10 {%k1}
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm6[0],xmm2[0]
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm12[0],xmm11[0]
-; AVX512DQ-SLOW-NEXT:    vinserti128 $1, %xmm2, %ymm3, %ymm2
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm10, %zmm22
-; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [5,13,5,13,5,13,5,13]
-; AVX512DQ-SLOW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm2
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm31, %zmm3, %zmm2
-; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm6 = zmm5[1],zmm30[1],zmm5[3],zmm30[3],zmm5[5],zmm30[5],zmm5[7],zmm30[7]
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm6 {%k1}
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm2
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm19, %zmm3, %zmm2
-; AVX512DQ-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm11 = [5,13,5,13]
-; AVX512DQ-SLOW-NEXT:    # ymm11 = mem[0,1,0,1]
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm10
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm18, %zmm11, %zmm10
-; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7]
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm6, %zmm23
-; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm6 = [4,12,4,12,4,12,4,12]
-; AVX512DQ-SLOW-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm2
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm31, %zmm6, %zmm2
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm10 = zmm5[0],zmm30[0],zmm5[2],zmm30[2],zmm5[4],zmm30[4],zmm5[6],zmm30[6]
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm10 {%k1}
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm2
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm19, %zmm6, %zmm2
-; AVX512DQ-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm12 = [4,12,4,12]
-; AVX512DQ-SLOW-NEXT:    # ymm12 = mem[0,1,0,1]
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm15
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm18, %zmm12, %zmm15
-; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm15[4,5,6,7]
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm10, %zmm24
-; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm10 = [7,15,7,15,7,15,7,15]
-; AVX512DQ-SLOW-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm15
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm30, %zmm10, %zmm15
-; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm8[1],zmm31[1],zmm8[3],zmm31[3],zmm8[5],zmm31[5],zmm8[7],zmm31[7]
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm13
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm19, %zmm10, %zmm13
-; AVX512DQ-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [7,15,7,15]
-; AVX512DQ-SLOW-NEXT:    # ymm2 = mem[0,1,0,1]
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm4
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm18, %zmm2, %zmm4
-; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm4[4,5,6,7]
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm15, %zmm25
-; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm26 = [6,14,6,14,6,14,6,14]
-; AVX512DQ-SLOW-NEXT:    # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm4
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm30, %zmm26, %zmm4
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm8[0],zmm31[0],zmm8[2],zmm31[2],zmm8[4],zmm31[4],zmm8[6],zmm31[6]
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm19, %zmm26, %zmm7
-; AVX512DQ-SLOW-NEXT:    vbroadcasti64x2 {{.*#+}} ymm19 = [6,14,6,14]
-; AVX512DQ-SLOW-NEXT:    # ymm19 = mem[0,1,0,1]
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm18, %zmm19, %zmm9
-; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm9[4,5,6,7]
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm7, %zmm4, %zmm18
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm4
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm29, %zmm14, %zmm4
-; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm27, %zmm0, %zmm14
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm14 {%k1}
-; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm4 = xmm17[1],xmm16[1]
-; AVX512DQ-SLOW-NEXT:    vmovdqa 64(%rsi), %xmm7
-; AVX512DQ-SLOW-NEXT:    vmovdqa 64(%rdi), %xmm9
-; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm13 = xmm9[1],xmm7[1]
-; AVX512DQ-SLOW-NEXT:    vinserti128 $1, %xmm4, %ymm13, %ymm4
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm14, %zmm28
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm4
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm29, %zmm20, %zmm4
-; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm27, %zmm0, %zmm20
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm20 {%k1}
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 64(%rdi), %zmm13
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm17[0],xmm16[0]
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 64(%rsi), %zmm16
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm7 = xmm9[0],xmm7[0]
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 64(%rdx), %zmm14
-; AVX512DQ-SLOW-NEXT:    vinserti128 $1, %xmm4, %ymm7, %ymm4
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm20, %zmm17
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm4
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm29, %zmm3, %zmm4
-; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm7 = zmm0[1],zmm27[1],zmm0[3],zmm27[3],zmm0[5],zmm27[5],zmm0[7],zmm27[7]
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm7 {%k1}
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 64(%rcx), %zmm4
-; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm16, %zmm13, %zmm3
-; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm4, %zmm14, %zmm11
-; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7]
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm7, %zmm3
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm7
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm29, %zmm6, %zmm7
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm9 = zmm0[0],zmm27[0],zmm0[2],zmm27[2],zmm0[4],zmm27[4],zmm0[6],zmm27[6]
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm9 {%k1}
-; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm16, %zmm13, %zmm6
-; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm4, %zmm14, %zmm12
-; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm12[4,5,6,7]
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm6, %zmm9, %zmm6
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm7
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm27, %zmm10, %zmm7
-; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm7 {%k1} = zmm1[1],zmm29[1],zmm1[3],zmm29[3],zmm1[5],zmm29[5],zmm1[7],zmm29[7]
-; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm16, %zmm13, %zmm10
-; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm4, %zmm14, %zmm2
-; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm2[4,5,6,7]
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm7, %zmm2
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm7
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm27, %zmm26, %zmm7
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm1[0],zmm29[0],zmm1[2],zmm29[2],zmm1[4],zmm29[4],zmm1[6],zmm29[6]
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm16, %zmm26, %zmm13
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm4, %zmm19, %zmm14
-; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm14[4,5,6,7]
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm7, %zmm4
-; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm7 = [3,11,3,11,3,11,3,11]
-; AVX512DQ-SLOW-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm9
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm31, %zmm7, %zmm9
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm10
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm30, %zmm7, %zmm10
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm10 {%k1}
-; AVX512DQ-SLOW-NEXT:    vmovdqa (%rcx), %ymm9
-; AVX512DQ-SLOW-NEXT:    vmovdqa (%rdx), %ymm11
-; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm12 = ymm11[1],ymm9[1],ymm11[3],ymm9[3]
-; AVX512DQ-SLOW-NEXT:    vmovdqa (%rsi), %ymm13
-; AVX512DQ-SLOW-NEXT:    vmovdqa (%rdi), %ymm14
-; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm15 = ymm14[1],ymm13[1],ymm14[3],ymm13[3]
-; AVX512DQ-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm12 = ymm15[2,3],ymm12[2,3]
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm12, %zmm10, %zmm10
-; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm12 = [2,10,2,10,2,10,2,10]
-; AVX512DQ-SLOW-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm31, %zmm12, %zmm8
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm30, %zmm12, %zmm5
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm5 {%k1}
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm8 = ymm11[0],ymm9[0],ymm11[2],ymm9[2]
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm9 = ymm14[0],ymm13[0],ymm14[2],ymm13[2]
-; AVX512DQ-SLOW-NEXT:    vmovdqa 64(%rsi), %ymm11
-; AVX512DQ-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm8 = ymm9[2,3],ymm8[2,3]
-; AVX512DQ-SLOW-NEXT:    vmovdqa 64(%rdi), %ymm9
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm8, %zmm5, %zmm5
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm8
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm29, %zmm7, %zmm8
-; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm27, %zmm0, %zmm7
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm7 {%k1}
-; AVX512DQ-SLOW-NEXT:    vmovdqa 64(%rcx), %ymm8
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm29, %zmm12, %zmm1
-; AVX512DQ-SLOW-NEXT:    vmovdqa 64(%rdx), %ymm13
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm27, %zmm12, %zmm0
-; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm12 = ymm13[1],ymm8[1],ymm13[3],ymm8[3]
-; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm14 = ymm9[1],ymm11[1],ymm9[3],ymm11[3]
-; AVX512DQ-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm12 = ymm14[2,3],ymm12[2,3]
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm12, %zmm7, %zmm7
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1}
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm13[0],ymm8[0],ymm13[2],ymm8[2]
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm8 = ymm9[0],ymm11[0],ymm9[2],ymm11[2]
-; AVX512DQ-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm8[2,3],ymm1[2,3]
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm0
-; AVX512DQ-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, 640(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm7, 704(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm5, 128(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm10, 192(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm4, 896(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm2, 960(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm6, 768(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, 832(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm17, 512(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm28, 576(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm18, 384(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm25, 448(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm24, 256(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm23, 320(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm22, (%rax)
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm21, 64(%rax)
-; AVX512DQ-SLOW-NEXT:    vzeroupper
-; AVX512DQ-SLOW-NEXT:    retq
-;
-; AVX512DQ-FAST-LABEL: store_i64_stride8_vf16:
-; AVX512DQ-FAST:       # %bb.0:
-; AVX512DQ-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX512DQ-FAST-NEXT:    vmovdqa64 (%rdi), %zmm7
-; AVX512DQ-FAST-NEXT:    vmovdqa64 (%rsi), %zmm19
-; AVX512DQ-FAST-NEXT:    vmovdqa64 (%rdx), %zmm9
-; AVX512DQ-FAST-NEXT:    vmovdqa64 (%rcx), %zmm18
-; AVX512DQ-FAST-NEXT:    vmovdqa64 (%r8), %zmm5
-; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%r8), %zmm0
-; AVX512DQ-FAST-NEXT:    vmovdqa64 (%r9), %zmm30
-; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%r9), %zmm27
-; AVX512DQ-FAST-NEXT:    vmovdqa64 (%r10), %zmm8
-; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%r10), %zmm1
-; AVX512DQ-FAST-NEXT:    vmovdqa64 (%rax), %zmm31
-; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%rax), %zmm29
-; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm14 = [1,9,1,9,1,9,1,9]
-; AVX512DQ-FAST-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm8, %zmm2
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm31, %zmm14, %zmm2
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm5, %zmm3
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm30, %zmm14, %zmm3
-; AVX512DQ-FAST-NEXT:    movb $-64, %al
-; AVX512DQ-FAST-NEXT:    kmovw %eax, %k1
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
-; AVX512DQ-FAST-NEXT:    vmovdqa (%rcx), %xmm2
-; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%rcx), %xmm16
-; AVX512DQ-FAST-NEXT:    vmovdqa (%rdx), %xmm6
-; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%rdx), %xmm17
-; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm10 = xmm6[1],xmm2[1]
-; AVX512DQ-FAST-NEXT:    vmovdqa (%rsi), %xmm11
-; AVX512DQ-FAST-NEXT:    vmovdqa (%rdi), %xmm12
-; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm20 = xmm12[1],xmm11[1]
-; AVX512DQ-FAST-NEXT:    vinserti32x4 $1, %xmm10, %ymm20, %ymm10
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm10, %zmm3, %zmm21
-; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm20 = [0,8,0,8,0,8,0,8]
-; AVX512DQ-FAST-NEXT:    # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm8, %zmm3
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm31, %zmm20, %zmm3
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm5, %zmm10
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm30, %zmm20, %zmm10
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, %zmm10 {%k1}
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm6[0],xmm2[0]
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm12[0],xmm11[0]
-; AVX512DQ-FAST-NEXT:    vinserti128 $1, %xmm2, %ymm3, %ymm2
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm10, %zmm22
-; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [5,13,5,13,5,13,5,13]
-; AVX512DQ-FAST-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm8, %zmm2
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm31, %zmm3, %zmm2
-; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm6 = zmm5[1],zmm30[1],zmm5[3],zmm30[3],zmm5[5],zmm30[5],zmm5[7],zmm30[7]
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, %zmm6 {%k1}
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm7, %zmm2
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm19, %zmm3, %zmm2
-; AVX512DQ-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm11 = [5,13,5,13]
-; AVX512DQ-FAST-NEXT:    # ymm11 = mem[0,1,0,1]
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm9, %zmm10
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm18, %zmm11, %zmm10
-; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7]
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm6, %zmm23
-; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm6 = [4,12,4,12,4,12,4,12]
-; AVX512DQ-FAST-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm8, %zmm2
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm31, %zmm6, %zmm2
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm10 = zmm5[0],zmm30[0],zmm5[2],zmm30[2],zmm5[4],zmm30[4],zmm5[6],zmm30[6]
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, %zmm10 {%k1}
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm7, %zmm2
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm19, %zmm6, %zmm2
-; AVX512DQ-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm12 = [4,12,4,12]
-; AVX512DQ-FAST-NEXT:    # ymm12 = mem[0,1,0,1]
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm9, %zmm15
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm18, %zmm12, %zmm15
-; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm15[4,5,6,7]
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm10, %zmm24
-; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm10 = [7,15,7,15,7,15,7,15]
-; AVX512DQ-FAST-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm5, %zmm15
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm30, %zmm10, %zmm15
-; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm8[1],zmm31[1],zmm8[3],zmm31[3],zmm8[5],zmm31[5],zmm8[7],zmm31[7]
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm7, %zmm13
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm19, %zmm10, %zmm13
-; AVX512DQ-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [7,15,7,15]
-; AVX512DQ-FAST-NEXT:    # ymm2 = mem[0,1,0,1]
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm9, %zmm4
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm18, %zmm2, %zmm4
-; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm4[4,5,6,7]
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm15, %zmm25
-; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm26 = [6,14,6,14,6,14,6,14]
-; AVX512DQ-FAST-NEXT:    # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm5, %zmm4
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm30, %zmm26, %zmm4
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm8[0],zmm31[0],zmm8[2],zmm31[2],zmm8[4],zmm31[4],zmm8[6],zmm31[6]
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm19, %zmm26, %zmm7
-; AVX512DQ-FAST-NEXT:    vbroadcasti64x2 {{.*#+}} ymm19 = [6,14,6,14]
-; AVX512DQ-FAST-NEXT:    # ymm19 = mem[0,1,0,1]
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm18, %zmm19, %zmm9
-; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm9[4,5,6,7]
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm7, %zmm4, %zmm18
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm1, %zmm4
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm29, %zmm14, %zmm4
-; AVX512DQ-FAST-NEXT:    vpermi2q %zmm27, %zmm0, %zmm14
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm4, %zmm14 {%k1}
-; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm4 = xmm17[1],xmm16[1]
-; AVX512DQ-FAST-NEXT:    vmovdqa 64(%rsi), %xmm7
-; AVX512DQ-FAST-NEXT:    vmovdqa 64(%rdi), %xmm9
-; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm13 = xmm9[1],xmm7[1]
-; AVX512DQ-FAST-NEXT:    vinserti128 $1, %xmm4, %ymm13, %ymm4
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm14, %zmm28
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm1, %zmm4
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm29, %zmm20, %zmm4
-; AVX512DQ-FAST-NEXT:    vpermi2q %zmm27, %zmm0, %zmm20
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm4, %zmm20 {%k1}
-; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%rdi), %zmm13
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm17[0],xmm16[0]
-; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%rsi), %zmm16
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm7 = xmm9[0],xmm7[0]
-; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%rdx), %zmm14
-; AVX512DQ-FAST-NEXT:    vinserti128 $1, %xmm4, %ymm7, %ymm4
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm20, %zmm17
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm1, %zmm4
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm29, %zmm3, %zmm4
-; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm7 = zmm0[1],zmm27[1],zmm0[3],zmm27[3],zmm0[5],zmm27[5],zmm0[7],zmm27[7]
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm4, %zmm7 {%k1}
-; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%rcx), %zmm4
-; AVX512DQ-FAST-NEXT:    vpermi2q %zmm16, %zmm13, %zmm3
-; AVX512DQ-FAST-NEXT:    vpermi2q %zmm4, %zmm14, %zmm11
-; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7]
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm7, %zmm3
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm1, %zmm7
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm29, %zmm6, %zmm7
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm9 = zmm0[0],zmm27[0],zmm0[2],zmm27[2],zmm0[4],zmm27[4],zmm0[6],zmm27[6]
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm7, %zmm9 {%k1}
-; AVX512DQ-FAST-NEXT:    vpermi2q %zmm16, %zmm13, %zmm6
-; AVX512DQ-FAST-NEXT:    vpermi2q %zmm4, %zmm14, %zmm12
-; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm12[4,5,6,7]
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm6, %zmm9, %zmm6
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm7
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm27, %zmm10, %zmm7
-; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm7 {%k1} = zmm1[1],zmm29[1],zmm1[3],zmm29[3],zmm1[5],zmm29[5],zmm1[7],zmm29[7]
-; AVX512DQ-FAST-NEXT:    vpermi2q %zmm16, %zmm13, %zmm10
-; AVX512DQ-FAST-NEXT:    vpermi2q %zmm4, %zmm14, %zmm2
-; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm2[4,5,6,7]
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm7, %zmm2
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm7
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm27, %zmm26, %zmm7
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm1[0],zmm29[0],zmm1[2],zmm29[2],zmm1[4],zmm29[4],zmm1[6],zmm29[6]
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm16, %zmm26, %zmm13
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm4, %zmm19, %zmm14
-; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm14[4,5,6,7]
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm7, %zmm4
-; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm7 = [3,11,3,11,3,11,3,11]
-; AVX512DQ-FAST-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm8, %zmm9
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm31, %zmm7, %zmm9
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm5, %zmm10
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm30, %zmm7, %zmm10
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm9, %zmm10 {%k1}
-; AVX512DQ-FAST-NEXT:    vmovdqa (%rcx), %ymm9
-; AVX512DQ-FAST-NEXT:    vmovdqa (%rdx), %ymm11
-; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm12 = ymm11[1],ymm9[1],ymm11[3],ymm9[3]
-; AVX512DQ-FAST-NEXT:    vmovdqa (%rsi), %ymm13
-; AVX512DQ-FAST-NEXT:    vmovdqa (%rdi), %ymm14
-; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm15 = ymm14[1],ymm13[1],ymm14[3],ymm13[3]
-; AVX512DQ-FAST-NEXT:    vperm2i128 {{.*#+}} ymm12 = ymm15[2,3],ymm12[2,3]
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm12, %zmm10, %zmm10
-; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm12 = [2,10,2,10,2,10,2,10]
-; AVX512DQ-FAST-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm31, %zmm12, %zmm8
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm30, %zmm12, %zmm5
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm8, %zmm5 {%k1}
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm8 = ymm11[0],ymm9[0],ymm11[2],ymm9[2]
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm9 = ymm14[0],ymm13[0],ymm14[2],ymm13[2]
-; AVX512DQ-FAST-NEXT:    vmovdqa 64(%rsi), %ymm11
-; AVX512DQ-FAST-NEXT:    vperm2i128 {{.*#+}} ymm8 = ymm9[2,3],ymm8[2,3]
-; AVX512DQ-FAST-NEXT:    vmovdqa 64(%rdi), %ymm9
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm8, %zmm5, %zmm5
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm1, %zmm8
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm29, %zmm7, %zmm8
-; AVX512DQ-FAST-NEXT:    vpermi2q %zmm27, %zmm0, %zmm7
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm8, %zmm7 {%k1}
-; AVX512DQ-FAST-NEXT:    vmovdqa 64(%rcx), %ymm8
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm29, %zmm12, %zmm1
-; AVX512DQ-FAST-NEXT:    vmovdqa 64(%rdx), %ymm13
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm27, %zmm12, %zmm0
-; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm12 = ymm13[1],ymm8[1],ymm13[3],ymm8[3]
-; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm14 = ymm9[1],ymm11[1],ymm9[3],ymm11[3]
-; AVX512DQ-FAST-NEXT:    vperm2i128 {{.*#+}} ymm12 = ymm14[2,3],ymm12[2,3]
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm12, %zmm7, %zmm7
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1}
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm13[0],ymm8[0],ymm13[2],ymm8[2]
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm8 = ymm9[0],ymm11[0],ymm9[2],ymm11[2]
-; AVX512DQ-FAST-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm8[2,3],ymm1[2,3]
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm0
-; AVX512DQ-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, 640(%rax)
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm7, 704(%rax)
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm5, 128(%rax)
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm10, 192(%rax)
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm4, 896(%rax)
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, 960(%rax)
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm6, 768(%rax)
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, 832(%rax)
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm17, 512(%rax)
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm28, 576(%rax)
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm18, 384(%rax)
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm25, 448(%rax)
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm24, 256(%rax)
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm23, 320(%rax)
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm22, (%rax)
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm21, 64(%rax)
-; AVX512DQ-FAST-NEXT:    vzeroupper
-; AVX512DQ-FAST-NEXT:    retq
-;
-; AVX512BW-ONLY-SLOW-LABEL: store_i64_stride8_vf16:
-; AVX512BW-ONLY-SLOW:       # %bb.0:
-; AVX512BW-ONLY-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-ONLY-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm7
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%rsi), %zmm19
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%rdx), %zmm9
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%rcx), %zmm18
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%r8), %zmm5
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 64(%r8), %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%r9), %zmm30
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 64(%r9), %zmm27
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%r10), %zmm8
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 64(%r10), %zmm1
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%rax), %zmm31
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 64(%rax), %zmm29
-; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm14 = [1,9,1,9,1,9,1,9]
-; AVX512BW-ONLY-SLOW-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm31, %zmm14, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm3
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm30, %zmm14, %zmm3
-; AVX512BW-ONLY-SLOW-NEXT:    movb $-64, %al
-; AVX512BW-ONLY-SLOW-NEXT:    kmovd %eax, %k1
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa (%rcx), %xmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 64(%rcx), %xmm16
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa (%rdx), %xmm6
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 64(%rdx), %xmm17
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm10 = xmm6[1],xmm2[1]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa (%rsi), %xmm11
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa (%rdi), %xmm12
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm20 = xmm12[1],xmm11[1]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti32x4 $1, %xmm10, %ymm20, %ymm10
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm10, %zmm3, %zmm21
-; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm20 = [0,8,0,8,0,8,0,8]
-; AVX512BW-ONLY-SLOW-NEXT:    # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm3
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm31, %zmm20, %zmm3
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm10
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm30, %zmm20, %zmm10
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm10 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm6[0],xmm2[0]
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm12[0],xmm11[0]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm2, %ymm3, %ymm2
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm10, %zmm22
-; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [5,13,5,13,5,13,5,13]
-; AVX512BW-ONLY-SLOW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm31, %zmm3, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm6 = zmm5[1],zmm30[1],zmm5[3],zmm30[3],zmm5[5],zmm30[5],zmm5[7],zmm30[7]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm6 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm19, %zmm3, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm11 = [5,13,5,13]
-; AVX512BW-ONLY-SLOW-NEXT:    # ymm11 = mem[0,1,0,1]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm10
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm18, %zmm11, %zmm10
-; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm6, %zmm23
-; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm6 = [4,12,4,12,4,12,4,12]
-; AVX512BW-ONLY-SLOW-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm31, %zmm6, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm10 = zmm5[0],zmm30[0],zmm5[2],zmm30[2],zmm5[4],zmm30[4],zmm5[6],zmm30[6]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm10 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm19, %zmm6, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm12 = [4,12,4,12]
-; AVX512BW-ONLY-SLOW-NEXT:    # ymm12 = mem[0,1,0,1]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm15
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm18, %zmm12, %zmm15
-; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm15[4,5,6,7]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm10, %zmm24
-; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm10 = [7,15,7,15,7,15,7,15]
-; AVX512BW-ONLY-SLOW-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm15
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm30, %zmm10, %zmm15
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm8[1],zmm31[1],zmm8[3],zmm31[3],zmm8[5],zmm31[5],zmm8[7],zmm31[7]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm13
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm19, %zmm10, %zmm13
-; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [7,15,7,15]
-; AVX512BW-ONLY-SLOW-NEXT:    # ymm2 = mem[0,1,0,1]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm4
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm18, %zmm2, %zmm4
-; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm4[4,5,6,7]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm15, %zmm25
-; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm26 = [6,14,6,14,6,14,6,14]
-; AVX512BW-ONLY-SLOW-NEXT:    # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm4
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm30, %zmm26, %zmm4
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm8[0],zmm31[0],zmm8[2],zmm31[2],zmm8[4],zmm31[4],zmm8[6],zmm31[6]
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm19, %zmm26, %zmm7
-; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} ymm19 = [6,14,6,14]
-; AVX512BW-ONLY-SLOW-NEXT:    # ymm19 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm18, %zmm19, %zmm9
-; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm9[4,5,6,7]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm7, %zmm4, %zmm18
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm4
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm29, %zmm14, %zmm4
-; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm27, %zmm0, %zmm14
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm14 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm4 = xmm17[1],xmm16[1]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 64(%rsi), %xmm7
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 64(%rdi), %xmm9
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm13 = xmm9[1],xmm7[1]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm4, %ymm13, %ymm4
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm14, %zmm28
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm4
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm29, %zmm20, %zmm4
-; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm27, %zmm0, %zmm20
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm20 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 64(%rdi), %zmm13
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm17[0],xmm16[0]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 64(%rsi), %zmm16
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm7 = xmm9[0],xmm7[0]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 64(%rdx), %zmm14
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm4, %ymm7, %ymm4
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm20, %zmm17
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm4
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm29, %zmm3, %zmm4
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm7 = zmm0[1],zmm27[1],zmm0[3],zmm27[3],zmm0[5],zmm27[5],zmm0[7],zmm27[7]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm7 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 64(%rcx), %zmm4
-; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm16, %zmm13, %zmm3
-; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm4, %zmm14, %zmm11
-; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm7, %zmm3
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm7
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm29, %zmm6, %zmm7
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm9 = zmm0[0],zmm27[0],zmm0[2],zmm27[2],zmm0[4],zmm27[4],zmm0[6],zmm27[6]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm9 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm16, %zmm13, %zmm6
-; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm4, %zmm14, %zmm12
-; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm12[4,5,6,7]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm6, %zmm9, %zmm6
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm7
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm27, %zmm10, %zmm7
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm7 {%k1} = zmm1[1],zmm29[1],zmm1[3],zmm29[3],zmm1[5],zmm29[5],zmm1[7],zmm29[7]
-; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm16, %zmm13, %zmm10
-; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm4, %zmm14, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm2[4,5,6,7]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm7, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm7
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm27, %zmm26, %zmm7
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm1[0],zmm29[0],zmm1[2],zmm29[2],zmm1[4],zmm29[4],zmm1[6],zmm29[6]
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm16, %zmm26, %zmm13
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm19, %zmm14
-; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm14[4,5,6,7]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm7, %zmm4
-; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm7 = [3,11,3,11,3,11,3,11]
-; AVX512BW-ONLY-SLOW-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm9
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm31, %zmm7, %zmm9
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm10
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm30, %zmm7, %zmm10
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm10 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa (%rcx), %ymm9
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa (%rdx), %ymm11
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm12 = ymm11[1],ymm9[1],ymm11[3],ymm9[3]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa (%rsi), %ymm13
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa (%rdi), %ymm14
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm15 = ymm14[1],ymm13[1],ymm14[3],ymm13[3]
-; AVX512BW-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm12 = ymm15[2,3],ymm12[2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm12, %zmm10, %zmm10
-; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm12 = [2,10,2,10,2,10,2,10]
-; AVX512BW-ONLY-SLOW-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm31, %zmm12, %zmm8
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm30, %zmm12, %zmm5
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm5 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm8 = ymm11[0],ymm9[0],ymm11[2],ymm9[2]
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm9 = ymm14[0],ymm13[0],ymm14[2],ymm13[2]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 64(%rsi), %ymm11
-; AVX512BW-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm8 = ymm9[2,3],ymm8[2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 64(%rdi), %ymm9
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm8, %zmm5, %zmm5
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm8
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm29, %zmm7, %zmm8
-; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm27, %zmm0, %zmm7
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm7 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 64(%rcx), %ymm8
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm29, %zmm12, %zmm1
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 64(%rdx), %ymm13
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm27, %zmm12, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm12 = ymm13[1],ymm8[1],ymm13[3],ymm8[3]
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm14 = ymm9[1],ymm11[1],ymm9[3],ymm11[3]
-; AVX512BW-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm12 = ymm14[2,3],ymm12[2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm12, %zmm7, %zmm7
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm13[0],ymm8[0],ymm13[2],ymm8[2]
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm8 = ymm9[0],ymm11[0],ymm9[2],ymm11[2]
-; AVX512BW-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm8[2,3],ymm1[2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, 640(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, 704(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, 128(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm10, 192(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, 896(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, 960(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, 768(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, 832(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm17, 512(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm28, 576(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm18, 384(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm25, 448(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm24, 256(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm23, 320(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm22, (%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm21, 64(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vzeroupper
-; AVX512BW-ONLY-SLOW-NEXT:    retq
-;
-; AVX512BW-ONLY-FAST-LABEL: store_i64_stride8_vf16:
-; AVX512BW-ONLY-FAST:       # %bb.0:
-; AVX512BW-ONLY-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-ONLY-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%rdi), %zmm7
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%rsi), %zmm19
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%rdx), %zmm9
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%rcx), %zmm18
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%r8), %zmm5
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%r8), %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%r9), %zmm30
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%r9), %zmm27
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%r10), %zmm8
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%r10), %zmm1
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%rax), %zmm31
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%rax), %zmm29
-; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm14 = [1,9,1,9,1,9,1,9]
-; AVX512BW-ONLY-FAST-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm31, %zmm14, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, %zmm3
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm30, %zmm14, %zmm3
-; AVX512BW-ONLY-FAST-NEXT:    movb $-64, %al
-; AVX512BW-ONLY-FAST-NEXT:    kmovd %eax, %k1
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa (%rcx), %xmm2
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%rcx), %xmm16
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa (%rdx), %xmm6
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%rdx), %xmm17
-; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm10 = xmm6[1],xmm2[1]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa (%rsi), %xmm11
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa (%rdi), %xmm12
-; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm20 = xmm12[1],xmm11[1]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti32x4 $1, %xmm10, %ymm20, %ymm10
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm10, %zmm3, %zmm21
-; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm20 = [0,8,0,8,0,8,0,8]
-; AVX512BW-ONLY-FAST-NEXT:    # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, %zmm3
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm31, %zmm20, %zmm3
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, %zmm10
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm30, %zmm20, %zmm10
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm10 {%k1}
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm6[0],xmm2[0]
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm12[0],xmm11[0]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti128 $1, %xmm2, %ymm3, %ymm2
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm10, %zmm22
-; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [5,13,5,13,5,13,5,13]
-; AVX512BW-ONLY-FAST-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm31, %zmm3, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm6 = zmm5[1],zmm30[1],zmm5[3],zmm30[3],zmm5[5],zmm30[5],zmm5[7],zmm30[7]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm6 {%k1}
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm19, %zmm3, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm11 = [5,13,5,13]
-; AVX512BW-ONLY-FAST-NEXT:    # ymm11 = mem[0,1,0,1]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, %zmm10
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm18, %zmm11, %zmm10
-; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm6, %zmm23
-; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm6 = [4,12,4,12,4,12,4,12]
-; AVX512BW-ONLY-FAST-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm31, %zmm6, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm10 = zmm5[0],zmm30[0],zmm5[2],zmm30[2],zmm5[4],zmm30[4],zmm5[6],zmm30[6]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm10 {%k1}
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm19, %zmm6, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm12 = [4,12,4,12]
-; AVX512BW-ONLY-FAST-NEXT:    # ymm12 = mem[0,1,0,1]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, %zmm15
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm18, %zmm12, %zmm15
-; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm15[4,5,6,7]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm10, %zmm24
-; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm10 = [7,15,7,15,7,15,7,15]
-; AVX512BW-ONLY-FAST-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, %zmm15
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm30, %zmm10, %zmm15
-; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm8[1],zmm31[1],zmm8[3],zmm31[3],zmm8[5],zmm31[5],zmm8[7],zmm31[7]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, %zmm13
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm19, %zmm10, %zmm13
-; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [7,15,7,15]
-; AVX512BW-ONLY-FAST-NEXT:    # ymm2 = mem[0,1,0,1]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, %zmm4
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm18, %zmm2, %zmm4
-; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm4[4,5,6,7]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm15, %zmm25
-; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm26 = [6,14,6,14,6,14,6,14]
-; AVX512BW-ONLY-FAST-NEXT:    # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, %zmm4
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm30, %zmm26, %zmm4
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm8[0],zmm31[0],zmm8[2],zmm31[2],zmm8[4],zmm31[4],zmm8[6],zmm31[6]
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm19, %zmm26, %zmm7
-; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} ymm19 = [6,14,6,14]
-; AVX512BW-ONLY-FAST-NEXT:    # ymm19 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm18, %zmm19, %zmm9
-; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm9[4,5,6,7]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm7, %zmm4, %zmm18
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm4
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm29, %zmm14, %zmm4
-; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm27, %zmm0, %zmm14
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm14 {%k1}
-; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm4 = xmm17[1],xmm16[1]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 64(%rsi), %xmm7
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 64(%rdi), %xmm9
-; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm13 = xmm9[1],xmm7[1]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti128 $1, %xmm4, %ymm13, %ymm4
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm14, %zmm28
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm4
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm29, %zmm20, %zmm4
-; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm27, %zmm0, %zmm20
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm20 {%k1}
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%rdi), %zmm13
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm17[0],xmm16[0]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%rsi), %zmm16
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm7 = xmm9[0],xmm7[0]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%rdx), %zmm14
-; AVX512BW-ONLY-FAST-NEXT:    vinserti128 $1, %xmm4, %ymm7, %ymm4
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm20, %zmm17
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm4
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm29, %zmm3, %zmm4
-; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm7 = zmm0[1],zmm27[1],zmm0[3],zmm27[3],zmm0[5],zmm27[5],zmm0[7],zmm27[7]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm7 {%k1}
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%rcx), %zmm4
-; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm16, %zmm13, %zmm3
-; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm4, %zmm14, %zmm11
-; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm7, %zmm3
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm7
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm29, %zmm6, %zmm7
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm9 = zmm0[0],zmm27[0],zmm0[2],zmm27[2],zmm0[4],zmm27[4],zmm0[6],zmm27[6]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, %zmm9 {%k1}
-; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm16, %zmm13, %zmm6
-; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm4, %zmm14, %zmm12
-; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm12[4,5,6,7]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm6, %zmm9, %zmm6
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm7
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm27, %zmm10, %zmm7
-; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm7 {%k1} = zmm1[1],zmm29[1],zmm1[3],zmm29[3],zmm1[5],zmm29[5],zmm1[7],zmm29[7]
-; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm16, %zmm13, %zmm10
-; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm4, %zmm14, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm2[4,5,6,7]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm7, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm7
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm27, %zmm26, %zmm7
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm1[0],zmm29[0],zmm1[2],zmm29[2],zmm1[4],zmm29[4],zmm1[6],zmm29[6]
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm16, %zmm26, %zmm13
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm19, %zmm14
-; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm14[4,5,6,7]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm7, %zmm4
-; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm7 = [3,11,3,11,3,11,3,11]
-; AVX512BW-ONLY-FAST-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, %zmm9
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm31, %zmm7, %zmm9
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, %zmm10
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm30, %zmm7, %zmm10
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, %zmm10 {%k1}
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa (%rcx), %ymm9
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa (%rdx), %ymm11
-; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm12 = ymm11[1],ymm9[1],ymm11[3],ymm9[3]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa (%rsi), %ymm13
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa (%rdi), %ymm14
-; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm15 = ymm14[1],ymm13[1],ymm14[3],ymm13[3]
-; AVX512BW-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm12 = ymm15[2,3],ymm12[2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm12, %zmm10, %zmm10
-; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm12 = [2,10,2,10,2,10,2,10]
-; AVX512BW-ONLY-FAST-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm31, %zmm12, %zmm8
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm30, %zmm12, %zmm5
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, %zmm5 {%k1}
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm8 = ymm11[0],ymm9[0],ymm11[2],ymm9[2]
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm9 = ymm14[0],ymm13[0],ymm14[2],ymm13[2]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 64(%rsi), %ymm11
-; AVX512BW-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm8 = ymm9[2,3],ymm8[2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 64(%rdi), %ymm9
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm8, %zmm5, %zmm5
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm8
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm29, %zmm7, %zmm8
-; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm27, %zmm0, %zmm7
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, %zmm7 {%k1}
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 64(%rcx), %ymm8
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm29, %zmm12, %zmm1
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 64(%rdx), %ymm13
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm27, %zmm12, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm12 = ymm13[1],ymm8[1],ymm13[3],ymm8[3]
-; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm14 = ymm9[1],ymm11[1],ymm9[3],ymm11[3]
-; AVX512BW-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm12 = ymm14[2,3],ymm12[2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm12, %zmm7, %zmm7
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1}
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm13[0],ymm8[0],ymm13[2],ymm8[2]
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm8 = ymm9[0],ymm11[0],ymm9[2],ymm11[2]
-; AVX512BW-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm8[2,3],ymm1[2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, 640(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, 704(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, 128(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm10, 192(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, 896(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, 960(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm6, 768(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, 832(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm17, 512(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm28, 576(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm18, 384(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm25, 448(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm24, 256(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm23, 320(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm22, (%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm21, 64(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vzeroupper
-; AVX512BW-ONLY-FAST-NEXT:    retq
+; AVX512F-LABEL: store_i64_stride8_vf16:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %r11
+; AVX512F-NEXT:    vmovdqa64 64(%rdi), %zmm11
+; AVX512F-NEXT:    vmovdqa64 (%rdi), %zmm3
+; AVX512F-NEXT:    vmovdqa64 64(%rsi), %zmm25
+; AVX512F-NEXT:    vmovdqa64 (%rsi), %zmm19
+; AVX512F-NEXT:    vmovdqa64 64(%rdx), %zmm12
+; AVX512F-NEXT:    vmovdqa64 (%rdx), %zmm0
+; AVX512F-NEXT:    vmovdqa64 64(%rcx), %zmm26
+; AVX512F-NEXT:    vmovdqa64 (%rcx), %zmm17
+; AVX512F-NEXT:    vmovdqa64 (%r8), %zmm6
+; AVX512F-NEXT:    vmovdqa64 64(%r8), %zmm16
+; AVX512F-NEXT:    vmovdqa64 (%r9), %zmm31
+; AVX512F-NEXT:    vmovdqa64 64(%r9), %zmm27
+; AVX512F-NEXT:    vmovdqa64 (%r11), %zmm8
+; AVX512F-NEXT:    vmovdqa64 64(%r11), %zmm30
+; AVX512F-NEXT:    vmovdqa64 (%r10), %zmm9
+; AVX512F-NEXT:    vmovdqa64 64(%r10), %zmm29
+; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm18 = [1,9,1,9,1,9,1,9]
+; AVX512F-NEXT:    # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm1
+; AVX512F-NEXT:    vpermt2q %zmm9, %zmm18, %zmm1
+; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm4
+; AVX512F-NEXT:    vpermt2q %zmm31, %zmm18, %zmm4
+; AVX512F-NEXT:    movb $-64, %r8b
+; AVX512F-NEXT:    kmovw %r8d, %k1
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
+; AVX512F-NEXT:    vmovdqa (%rsi), %xmm1
+; AVX512F-NEXT:    vinserti128 $1, (%rcx), %ymm1, %ymm1
+; AVX512F-NEXT:    vmovdqa (%rdi), %xmm10
+; AVX512F-NEXT:    vinserti128 $1, (%rdx), %ymm10, %ymm10
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm13 = ymm10[1],ymm1[1],ymm10[3],ymm1[3]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm13, %zmm4, %zmm2
+; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm23 = [0,8,0,8,0,8,0,8]
+; AVX512F-NEXT:    # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm4
+; AVX512F-NEXT:    vpermt2q %zmm9, %zmm23, %zmm4
+; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm13
+; AVX512F-NEXT:    vpermt2q %zmm31, %zmm23, %zmm13
+; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm13 {%k1}
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm10[0],ymm1[0],ymm10[2],ymm1[2]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm13, %zmm21
+; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm10 = [5,13,5,13,5,13,5,13]
+; AVX512F-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm1
+; AVX512F-NEXT:    vpermt2q %zmm9, %zmm10, %zmm1
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm4 = zmm6[1],zmm31[1],zmm6[3],zmm31[3],zmm6[5],zmm31[5],zmm6[7],zmm31[7]
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
+; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm1
+; AVX512F-NEXT:    vpermt2q %zmm19, %zmm10, %zmm1
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm14 = [5,13,5,13]
+; AVX512F-NEXT:    # ymm14 = mem[0,1,0,1]
+; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm13
+; AVX512F-NEXT:    vpermt2q %zmm17, %zmm14, %zmm13
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm13[4,5,6,7]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm22
+; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm15 = [4,12,4,12,4,12,4,12]
+; AVX512F-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm1
+; AVX512F-NEXT:    vpermt2q %zmm9, %zmm15, %zmm1
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm4 = zmm6[0],zmm31[0],zmm6[2],zmm31[2],zmm6[4],zmm31[4],zmm6[6],zmm31[6]
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
+; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm13
+; AVX512F-NEXT:    vpermt2q %zmm19, %zmm15, %zmm13
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [4,12,4,12]
+; AVX512F-NEXT:    # ymm1 = mem[0,1,0,1]
+; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm5
+; AVX512F-NEXT:    vpermt2q %zmm17, %zmm1, %zmm5
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3],ymm5[4,5,6,7]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm5, %zmm4, %zmm24
+; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm13 = [7,15,7,15,7,15,7,15]
+; AVX512F-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm5
+; AVX512F-NEXT:    vpermt2q %zmm31, %zmm13, %zmm5
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm8[1],zmm9[1],zmm8[3],zmm9[3],zmm8[5],zmm9[5],zmm8[7],zmm9[7]
+; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm2
+; AVX512F-NEXT:    vpermt2q %zmm19, %zmm13, %zmm2
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [7,15,7,15]
+; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
+; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm7
+; AVX512F-NEXT:    vpermt2q %zmm17, %zmm4, %zmm7
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm7[4,5,6,7]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm2, %zmm5, %zmm20
+; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm28 = [6,14,6,14,6,14,6,14]
+; AVX512F-NEXT:    # zmm28 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm2
+; AVX512F-NEXT:    vpermt2q %zmm31, %zmm28, %zmm2
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm8[0],zmm9[0],zmm8[2],zmm9[2],zmm8[4],zmm9[4],zmm8[6],zmm9[6]
+; AVX512F-NEXT:    vpermt2q %zmm19, %zmm28, %zmm3
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm7 = [6,14,6,14]
+; AVX512F-NEXT:    # ymm7 = mem[0,1,0,1]
+; AVX512F-NEXT:    vpermt2q %zmm17, %zmm7, %zmm0
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm17
+; AVX512F-NEXT:    vmovdqa64 %zmm30, %zmm0
+; AVX512F-NEXT:    vpermt2q %zmm29, %zmm18, %zmm0
+; AVX512F-NEXT:    vpermi2q %zmm27, %zmm16, %zmm18
+; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm18 {%k1}
+; AVX512F-NEXT:    vmovdqa 64(%rsi), %xmm0
+; AVX512F-NEXT:    vinserti128 $1, 64(%rcx), %ymm0, %ymm0
+; AVX512F-NEXT:    vmovdqa 64(%rdi), %xmm2
+; AVX512F-NEXT:    vinserti128 $1, 64(%rdx), %ymm2, %ymm2
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm2[1],ymm0[1],ymm2[3],ymm0[3]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm3, %zmm18, %zmm18
+; AVX512F-NEXT:    vmovdqa64 %zmm30, %zmm3
+; AVX512F-NEXT:    vpermt2q %zmm29, %zmm23, %zmm3
+; AVX512F-NEXT:    vpermi2q %zmm27, %zmm16, %zmm23
+; AVX512F-NEXT:    vmovdqa64 %zmm3, %zmm23 {%k1}
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[2],ymm0[2]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm23, %zmm19
+; AVX512F-NEXT:    vmovdqa64 %zmm30, %zmm0
+; AVX512F-NEXT:    vpermt2q %zmm29, %zmm10, %zmm0
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm2 = zmm16[1],zmm27[1],zmm16[3],zmm27[3],zmm16[5],zmm27[5],zmm16[7],zmm27[7]
+; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k1}
+; AVX512F-NEXT:    vpermi2q %zmm25, %zmm11, %zmm10
+; AVX512F-NEXT:    vpermi2q %zmm26, %zmm12, %zmm14
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm14[4,5,6,7]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm2
+; AVX512F-NEXT:    vmovdqa64 %zmm30, %zmm0
+; AVX512F-NEXT:    vpermt2q %zmm29, %zmm15, %zmm0
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm16[0],zmm27[0],zmm16[2],zmm27[2],zmm16[4],zmm27[4],zmm16[6],zmm27[6]
+; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm3 {%k1}
+; AVX512F-NEXT:    vpermi2q %zmm25, %zmm11, %zmm15
+; AVX512F-NEXT:    vpermi2q %zmm26, %zmm12, %zmm1
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm0, %zmm3, %zmm1
+; AVX512F-NEXT:    vmovdqa64 %zmm16, %zmm0
+; AVX512F-NEXT:    vpermt2q %zmm27, %zmm13, %zmm0
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm30[1],zmm29[1],zmm30[3],zmm29[3],zmm30[5],zmm29[5],zmm30[7],zmm29[7]
+; AVX512F-NEXT:    vpermi2q %zmm25, %zmm11, %zmm13
+; AVX512F-NEXT:    vpermi2q %zmm26, %zmm12, %zmm4
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3],ymm4[4,5,6,7]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm3, %zmm0, %zmm4
+; AVX512F-NEXT:    vmovdqa64 %zmm16, %zmm0
+; AVX512F-NEXT:    vpermt2q %zmm27, %zmm28, %zmm0
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm30[0],zmm29[0],zmm30[2],zmm29[2],zmm30[4],zmm29[4],zmm30[6],zmm29[6]
+; AVX512F-NEXT:    vpermt2q %zmm25, %zmm28, %zmm11
+; AVX512F-NEXT:    vpermt2q %zmm26, %zmm7, %zmm12
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm3 = ymm11[0,1,2,3],ymm12[4,5,6,7]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm3, %zmm0, %zmm0
+; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [3,11,3,11,3,11,3,11]
+; AVX512F-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm7
+; AVX512F-NEXT:    vpermt2q %zmm9, %zmm3, %zmm7
+; AVX512F-NEXT:    vmovdqa64 %zmm6, %zmm10
+; AVX512F-NEXT:    vpermt2q %zmm31, %zmm3, %zmm10
+; AVX512F-NEXT:    vmovdqa64 %zmm7, %zmm10 {%k1}
+; AVX512F-NEXT:    vmovdqa (%rcx), %ymm7
+; AVX512F-NEXT:    vmovdqa 64(%rcx), %ymm11
+; AVX512F-NEXT:    vmovdqa (%rdx), %ymm12
+; AVX512F-NEXT:    vmovdqa 64(%rdx), %ymm13
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm14 = ymm12[1],ymm7[1],ymm12[3],ymm7[3]
+; AVX512F-NEXT:    vmovdqa (%rsi), %ymm15
+; AVX512F-NEXT:    vmovdqa64 64(%rsi), %ymm23
+; AVX512F-NEXT:    vmovdqa64 (%rdi), %ymm25
+; AVX512F-NEXT:    vmovdqa64 64(%rdi), %ymm26
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm5 = ymm25[1],ymm15[1],ymm25[3],ymm15[3]
+; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm5 = ymm5[2,3],ymm14[2,3]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm5, %zmm10, %zmm5
+; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm10 = [2,10,2,10,2,10,2,10]
+; AVX512F-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vpermt2q %zmm9, %zmm10, %zmm8
+; AVX512F-NEXT:    vpermt2q %zmm31, %zmm10, %zmm6
+; AVX512F-NEXT:    vmovdqa64 %zmm8, %zmm6 {%k1}
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm7 = ymm12[0],ymm7[0],ymm12[2],ymm7[2]
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm8 = ymm25[0],ymm15[0],ymm25[2],ymm15[2]
+; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm7 = ymm8[2,3],ymm7[2,3]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm7, %zmm6, %zmm6
+; AVX512F-NEXT:    vmovdqa64 %zmm30, %zmm7
+; AVX512F-NEXT:    vpermt2q %zmm29, %zmm3, %zmm7
+; AVX512F-NEXT:    vpermi2q %zmm27, %zmm16, %zmm3
+; AVX512F-NEXT:    vmovdqa64 %zmm7, %zmm3 {%k1}
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm7 = ymm13[1],ymm11[1],ymm13[3],ymm11[3]
+; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} ymm8 = ymm26[1],ymm23[1],ymm26[3],ymm23[3]
+; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm7 = ymm8[2,3],ymm7[2,3]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm7, %zmm3, %zmm3
+; AVX512F-NEXT:    vpermt2q %zmm29, %zmm10, %zmm30
+; AVX512F-NEXT:    vpermt2q %zmm27, %zmm10, %zmm16
+; AVX512F-NEXT:    vmovdqa64 %zmm30, %zmm16 {%k1}
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm7 = ymm13[0],ymm11[0],ymm13[2],ymm11[2]
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm8 = ymm26[0],ymm23[0],ymm26[2],ymm23[2]
+; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm7 = ymm8[2,3],ymm7[2,3]
+; AVX512F-NEXT:    vinserti64x4 $0, %ymm7, %zmm16, %zmm7
+; AVX512F-NEXT:    vmovdqa64 %zmm7, 640(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm3, 704(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm6, 128(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm5, 192(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm0, 896(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm4, 960(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm1, 768(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm2, 832(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm19, 512(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm18, 576(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm17, 384(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm20, 448(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm24, 256(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm22, 320(%rax)
+; AVX512F-NEXT:    vmovdqa64 %zmm21, (%rax)
+; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT:    vmovaps %zmm0, 64(%rax)
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
 ;
-; AVX512DQBW-SLOW-LABEL: store_i64_stride8_vf16:
-; AVX512DQBW-SLOW:       # %bb.0:
-; AVX512DQBW-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQBW-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm7
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%rsi), %zmm19
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%rdx), %zmm9
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%rcx), %zmm18
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%r8), %zmm5
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 64(%r8), %zmm0
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%r9), %zmm30
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 64(%r9), %zmm27
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%r10), %zmm8
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 64(%r10), %zmm1
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%rax), %zmm31
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 64(%rax), %zmm29
-; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm14 = [1,9,1,9,1,9,1,9]
-; AVX512DQBW-SLOW-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm2
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm31, %zmm14, %zmm2
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm3
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm30, %zmm14, %zmm3
-; AVX512DQBW-SLOW-NEXT:    movb $-64, %al
-; AVX512DQBW-SLOW-NEXT:    kmovd %eax, %k1
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vmovdqa (%rcx), %xmm2
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 64(%rcx), %xmm16
-; AVX512DQBW-SLOW-NEXT:    vmovdqa (%rdx), %xmm6
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 64(%rdx), %xmm17
-; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm10 = xmm6[1],xmm2[1]
-; AVX512DQBW-SLOW-NEXT:    vmovdqa (%rsi), %xmm11
-; AVX512DQBW-SLOW-NEXT:    vmovdqa (%rdi), %xmm12
-; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm20 = xmm12[1],xmm11[1]
-; AVX512DQBW-SLOW-NEXT:    vinserti32x4 $1, %xmm10, %ymm20, %ymm10
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm10, %zmm3, %zmm21
-; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm20 = [0,8,0,8,0,8,0,8]
-; AVX512DQBW-SLOW-NEXT:    # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm3
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm31, %zmm20, %zmm3
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm10
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm30, %zmm20, %zmm10
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm10 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm6[0],xmm2[0]
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm12[0],xmm11[0]
-; AVX512DQBW-SLOW-NEXT:    vinserti128 $1, %xmm2, %ymm3, %ymm2
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm10, %zmm22
-; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [5,13,5,13,5,13,5,13]
-; AVX512DQBW-SLOW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm2
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm31, %zmm3, %zmm2
-; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm6 = zmm5[1],zmm30[1],zmm5[3],zmm30[3],zmm5[5],zmm30[5],zmm5[7],zmm30[7]
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm6 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm2
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm19, %zmm3, %zmm2
-; AVX512DQBW-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm11 = [5,13,5,13]
-; AVX512DQBW-SLOW-NEXT:    # ymm11 = mem[0,1,0,1]
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm10
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm18, %zmm11, %zmm10
-; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7]
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm6, %zmm23
-; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm6 = [4,12,4,12,4,12,4,12]
-; AVX512DQBW-SLOW-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm2
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm31, %zmm6, %zmm2
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm10 = zmm5[0],zmm30[0],zmm5[2],zmm30[2],zmm5[4],zmm30[4],zmm5[6],zmm30[6]
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm10 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm2
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm19, %zmm6, %zmm2
-; AVX512DQBW-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm12 = [4,12,4,12]
-; AVX512DQBW-SLOW-NEXT:    # ymm12 = mem[0,1,0,1]
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm15
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm18, %zmm12, %zmm15
-; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm15[4,5,6,7]
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm10, %zmm24
-; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm10 = [7,15,7,15,7,15,7,15]
-; AVX512DQBW-SLOW-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm15
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm30, %zmm10, %zmm15
-; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm8[1],zmm31[1],zmm8[3],zmm31[3],zmm8[5],zmm31[5],zmm8[7],zmm31[7]
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm13
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm19, %zmm10, %zmm13
-; AVX512DQBW-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [7,15,7,15]
-; AVX512DQBW-SLOW-NEXT:    # ymm2 = mem[0,1,0,1]
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm4
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm18, %zmm2, %zmm4
-; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm4[4,5,6,7]
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm15, %zmm25
-; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm26 = [6,14,6,14,6,14,6,14]
-; AVX512DQBW-SLOW-NEXT:    # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm4
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm30, %zmm26, %zmm4
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm8[0],zmm31[0],zmm8[2],zmm31[2],zmm8[4],zmm31[4],zmm8[6],zmm31[6]
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm19, %zmm26, %zmm7
-; AVX512DQBW-SLOW-NEXT:    vbroadcasti64x2 {{.*#+}} ymm19 = [6,14,6,14]
-; AVX512DQBW-SLOW-NEXT:    # ymm19 = mem[0,1,0,1]
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm18, %zmm19, %zmm9
-; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm9[4,5,6,7]
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm7, %zmm4, %zmm18
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm4
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm29, %zmm14, %zmm4
-; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm27, %zmm0, %zmm14
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm14 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm4 = xmm17[1],xmm16[1]
-; AVX512DQBW-SLOW-NEXT:    vmovdqa 64(%rsi), %xmm7
-; AVX512DQBW-SLOW-NEXT:    vmovdqa 64(%rdi), %xmm9
-; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm13 = xmm9[1],xmm7[1]
-; AVX512DQBW-SLOW-NEXT:    vinserti128 $1, %xmm4, %ymm13, %ymm4
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm14, %zmm28
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm4
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm29, %zmm20, %zmm4
-; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm27, %zmm0, %zmm20
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm20 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 64(%rdi), %zmm13
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm17[0],xmm16[0]
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 64(%rsi), %zmm16
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm7 = xmm9[0],xmm7[0]
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 64(%rdx), %zmm14
-; AVX512DQBW-SLOW-NEXT:    vinserti128 $1, %xmm4, %ymm7, %ymm4
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm20, %zmm17
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm4
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm29, %zmm3, %zmm4
-; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm7 = zmm0[1],zmm27[1],zmm0[3],zmm27[3],zmm0[5],zmm27[5],zmm0[7],zmm27[7]
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm7 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 64(%rcx), %zmm4
-; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm16, %zmm13, %zmm3
-; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm4, %zmm14, %zmm11
-; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7]
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm7, %zmm3
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm7
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm29, %zmm6, %zmm7
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm9 = zmm0[0],zmm27[0],zmm0[2],zmm27[2],zmm0[4],zmm27[4],zmm0[6],zmm27[6]
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm9 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm16, %zmm13, %zmm6
-; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm4, %zmm14, %zmm12
-; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm12[4,5,6,7]
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm6, %zmm9, %zmm6
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm7
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm27, %zmm10, %zmm7
-; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm7 {%k1} = zmm1[1],zmm29[1],zmm1[3],zmm29[3],zmm1[5],zmm29[5],zmm1[7],zmm29[7]
-; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm16, %zmm13, %zmm10
-; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm4, %zmm14, %zmm2
-; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm2[4,5,6,7]
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm7, %zmm2
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm7
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm27, %zmm26, %zmm7
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm1[0],zmm29[0],zmm1[2],zmm29[2],zmm1[4],zmm29[4],zmm1[6],zmm29[6]
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm16, %zmm26, %zmm13
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm4, %zmm19, %zmm14
-; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm14[4,5,6,7]
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm7, %zmm4
-; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm7 = [3,11,3,11,3,11,3,11]
-; AVX512DQBW-SLOW-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm9
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm31, %zmm7, %zmm9
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm10
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm30, %zmm7, %zmm10
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm10 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vmovdqa (%rcx), %ymm9
-; AVX512DQBW-SLOW-NEXT:    vmovdqa (%rdx), %ymm11
-; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm12 = ymm11[1],ymm9[1],ymm11[3],ymm9[3]
-; AVX512DQBW-SLOW-NEXT:    vmovdqa (%rsi), %ymm13
-; AVX512DQBW-SLOW-NEXT:    vmovdqa (%rdi), %ymm14
-; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm15 = ymm14[1],ymm13[1],ymm14[3],ymm13[3]
-; AVX512DQBW-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm12 = ymm15[2,3],ymm12[2,3]
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm12, %zmm10, %zmm10
-; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm12 = [2,10,2,10,2,10,2,10]
-; AVX512DQBW-SLOW-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm31, %zmm12, %zmm8
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm30, %zmm12, %zmm5
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm5 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm8 = ymm11[0],ymm9[0],ymm11[2],ymm9[2]
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm9 = ymm14[0],ymm13[0],ymm14[2],ymm13[2]
-; AVX512DQBW-SLOW-NEXT:    vmovdqa 64(%rsi), %ymm11
-; AVX512DQBW-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm8 = ymm9[2,3],ymm8[2,3]
-; AVX512DQBW-SLOW-NEXT:    vmovdqa 64(%rdi), %ymm9
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm8, %zmm5, %zmm5
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm8
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm29, %zmm7, %zmm8
-; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm27, %zmm0, %zmm7
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm7 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vmovdqa 64(%rcx), %ymm8
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm29, %zmm12, %zmm1
-; AVX512DQBW-SLOW-NEXT:    vmovdqa 64(%rdx), %ymm13
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm27, %zmm12, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm12 = ymm13[1],ymm8[1],ymm13[3],ymm8[3]
-; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm14 = ymm9[1],ymm11[1],ymm9[3],ymm11[3]
-; AVX512DQBW-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm12 = ymm14[2,3],ymm12[2,3]
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm12, %zmm7, %zmm7
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm13[0],ymm8[0],ymm13[2],ymm8[2]
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm8 = ymm9[0],ymm11[0],ymm9[2],ymm11[2]
-; AVX512DQBW-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm8[2,3],ymm1[2,3]
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm0
-; AVX512DQBW-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, 640(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm7, 704(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm5, 128(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm10, 192(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm4, 896(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm2, 960(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm6, 768(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm3, 832(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm17, 512(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm28, 576(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm18, 384(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm25, 448(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm24, 256(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm23, 320(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm22, (%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm21, 64(%rax)
-; AVX512DQBW-SLOW-NEXT:    vzeroupper
-; AVX512DQBW-SLOW-NEXT:    retq
-;
-; AVX512DQBW-FAST-LABEL: store_i64_stride8_vf16:
-; AVX512DQBW-FAST:       # %bb.0:
-; AVX512DQBW-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQBW-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%rdi), %zmm7
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%rsi), %zmm19
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%rdx), %zmm9
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%rcx), %zmm18
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%r8), %zmm5
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%r8), %zmm0
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%r9), %zmm30
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%r9), %zmm27
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%r10), %zmm8
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%r10), %zmm1
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%rax), %zmm31
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%rax), %zmm29
-; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm14 = [1,9,1,9,1,9,1,9]
-; AVX512DQBW-FAST-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm8, %zmm2
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm31, %zmm14, %zmm2
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm5, %zmm3
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm30, %zmm14, %zmm3
-; AVX512DQBW-FAST-NEXT:    movb $-64, %al
-; AVX512DQBW-FAST-NEXT:    kmovd %eax, %k1
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
-; AVX512DQBW-FAST-NEXT:    vmovdqa (%rcx), %xmm2
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%rcx), %xmm16
-; AVX512DQBW-FAST-NEXT:    vmovdqa (%rdx), %xmm6
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%rdx), %xmm17
-; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm10 = xmm6[1],xmm2[1]
-; AVX512DQBW-FAST-NEXT:    vmovdqa (%rsi), %xmm11
-; AVX512DQBW-FAST-NEXT:    vmovdqa (%rdi), %xmm12
-; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm20 = xmm12[1],xmm11[1]
-; AVX512DQBW-FAST-NEXT:    vinserti32x4 $1, %xmm10, %ymm20, %ymm10
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm10, %zmm3, %zmm21
-; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm20 = [0,8,0,8,0,8,0,8]
-; AVX512DQBW-FAST-NEXT:    # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm8, %zmm3
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm31, %zmm20, %zmm3
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm5, %zmm10
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm30, %zmm20, %zmm10
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, %zmm10 {%k1}
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm6[0],xmm2[0]
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm12[0],xmm11[0]
-; AVX512DQBW-FAST-NEXT:    vinserti128 $1, %xmm2, %ymm3, %ymm2
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm10, %zmm22
-; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [5,13,5,13,5,13,5,13]
-; AVX512DQBW-FAST-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm8, %zmm2
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm31, %zmm3, %zmm2
-; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm6 = zmm5[1],zmm30[1],zmm5[3],zmm30[3],zmm5[5],zmm30[5],zmm5[7],zmm30[7]
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm6 {%k1}
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm7, %zmm2
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm19, %zmm3, %zmm2
-; AVX512DQBW-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm11 = [5,13,5,13]
-; AVX512DQBW-FAST-NEXT:    # ymm11 = mem[0,1,0,1]
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm9, %zmm10
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm18, %zmm11, %zmm10
-; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7]
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm6, %zmm23
-; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm6 = [4,12,4,12,4,12,4,12]
-; AVX512DQBW-FAST-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm8, %zmm2
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm31, %zmm6, %zmm2
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm10 = zmm5[0],zmm30[0],zmm5[2],zmm30[2],zmm5[4],zmm30[4],zmm5[6],zmm30[6]
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm10 {%k1}
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm7, %zmm2
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm19, %zmm6, %zmm2
-; AVX512DQBW-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm12 = [4,12,4,12]
-; AVX512DQBW-FAST-NEXT:    # ymm12 = mem[0,1,0,1]
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm9, %zmm15
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm18, %zmm12, %zmm15
-; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm15[4,5,6,7]
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm10, %zmm24
-; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm10 = [7,15,7,15,7,15,7,15]
-; AVX512DQBW-FAST-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm5, %zmm15
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm30, %zmm10, %zmm15
-; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm8[1],zmm31[1],zmm8[3],zmm31[3],zmm8[5],zmm31[5],zmm8[7],zmm31[7]
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm7, %zmm13
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm19, %zmm10, %zmm13
-; AVX512DQBW-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [7,15,7,15]
-; AVX512DQBW-FAST-NEXT:    # ymm2 = mem[0,1,0,1]
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm9, %zmm4
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm18, %zmm2, %zmm4
-; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm4[4,5,6,7]
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm15, %zmm25
-; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm26 = [6,14,6,14,6,14,6,14]
-; AVX512DQBW-FAST-NEXT:    # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm5, %zmm4
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm30, %zmm26, %zmm4
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm8[0],zmm31[0],zmm8[2],zmm31[2],zmm8[4],zmm31[4],zmm8[6],zmm31[6]
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm19, %zmm26, %zmm7
-; AVX512DQBW-FAST-NEXT:    vbroadcasti64x2 {{.*#+}} ymm19 = [6,14,6,14]
-; AVX512DQBW-FAST-NEXT:    # ymm19 = mem[0,1,0,1]
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm18, %zmm19, %zmm9
-; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm9[4,5,6,7]
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm7, %zmm4, %zmm18
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, %zmm4
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm29, %zmm14, %zmm4
-; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm27, %zmm0, %zmm14
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm4, %zmm14 {%k1}
-; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm4 = xmm17[1],xmm16[1]
-; AVX512DQBW-FAST-NEXT:    vmovdqa 64(%rsi), %xmm7
-; AVX512DQBW-FAST-NEXT:    vmovdqa 64(%rdi), %xmm9
-; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm13 = xmm9[1],xmm7[1]
-; AVX512DQBW-FAST-NEXT:    vinserti128 $1, %xmm4, %ymm13, %ymm4
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm14, %zmm28
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, %zmm4
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm29, %zmm20, %zmm4
-; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm27, %zmm0, %zmm20
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm4, %zmm20 {%k1}
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%rdi), %zmm13
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm17[0],xmm16[0]
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%rsi), %zmm16
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm7 = xmm9[0],xmm7[0]
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%rdx), %zmm14
-; AVX512DQBW-FAST-NEXT:    vinserti128 $1, %xmm4, %ymm7, %ymm4
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm20, %zmm17
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, %zmm4
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm29, %zmm3, %zmm4
-; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm7 = zmm0[1],zmm27[1],zmm0[3],zmm27[3],zmm0[5],zmm27[5],zmm0[7],zmm27[7]
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm4, %zmm7 {%k1}
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%rcx), %zmm4
-; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm16, %zmm13, %zmm3
-; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm4, %zmm14, %zmm11
-; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7]
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm7, %zmm3
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, %zmm7
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm29, %zmm6, %zmm7
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm9 = zmm0[0],zmm27[0],zmm0[2],zmm27[2],zmm0[4],zmm27[4],zmm0[6],zmm27[6]
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm7, %zmm9 {%k1}
-; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm16, %zmm13, %zmm6
-; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm4, %zmm14, %zmm12
-; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm12[4,5,6,7]
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm6, %zmm9, %zmm6
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm7
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm27, %zmm10, %zmm7
-; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm7 {%k1} = zmm1[1],zmm29[1],zmm1[3],zmm29[3],zmm1[5],zmm29[5],zmm1[7],zmm29[7]
-; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm16, %zmm13, %zmm10
-; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm4, %zmm14, %zmm2
-; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm2[4,5,6,7]
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm7, %zmm2
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm7
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm27, %zmm26, %zmm7
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm1[0],zmm29[0],zmm1[2],zmm29[2],zmm1[4],zmm29[4],zmm1[6],zmm29[6]
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm16, %zmm26, %zmm13
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm4, %zmm19, %zmm14
-; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm14[4,5,6,7]
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm7, %zmm4
-; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm7 = [3,11,3,11,3,11,3,11]
-; AVX512DQBW-FAST-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm8, %zmm9
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm31, %zmm7, %zmm9
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm5, %zmm10
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm30, %zmm7, %zmm10
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm9, %zmm10 {%k1}
-; AVX512DQBW-FAST-NEXT:    vmovdqa (%rcx), %ymm9
-; AVX512DQBW-FAST-NEXT:    vmovdqa (%rdx), %ymm11
-; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm12 = ymm11[1],ymm9[1],ymm11[3],ymm9[3]
-; AVX512DQBW-FAST-NEXT:    vmovdqa (%rsi), %ymm13
-; AVX512DQBW-FAST-NEXT:    vmovdqa (%rdi), %ymm14
-; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm15 = ymm14[1],ymm13[1],ymm14[3],ymm13[3]
-; AVX512DQBW-FAST-NEXT:    vperm2i128 {{.*#+}} ymm12 = ymm15[2,3],ymm12[2,3]
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm12, %zmm10, %zmm10
-; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm12 = [2,10,2,10,2,10,2,10]
-; AVX512DQBW-FAST-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm31, %zmm12, %zmm8
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm30, %zmm12, %zmm5
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm8, %zmm5 {%k1}
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm8 = ymm11[0],ymm9[0],ymm11[2],ymm9[2]
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm9 = ymm14[0],ymm13[0],ymm14[2],ymm13[2]
-; AVX512DQBW-FAST-NEXT:    vmovdqa 64(%rsi), %ymm11
-; AVX512DQBW-FAST-NEXT:    vperm2i128 {{.*#+}} ymm8 = ymm9[2,3],ymm8[2,3]
-; AVX512DQBW-FAST-NEXT:    vmovdqa 64(%rdi), %ymm9
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm8, %zmm5, %zmm5
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, %zmm8
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm29, %zmm7, %zmm8
-; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm27, %zmm0, %zmm7
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm8, %zmm7 {%k1}
-; AVX512DQBW-FAST-NEXT:    vmovdqa 64(%rcx), %ymm8
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm29, %zmm12, %zmm1
-; AVX512DQBW-FAST-NEXT:    vmovdqa 64(%rdx), %ymm13
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm27, %zmm12, %zmm0
-; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm12 = ymm13[1],ymm8[1],ymm13[3],ymm8[3]
-; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm14 = ymm9[1],ymm11[1],ymm9[3],ymm11[3]
-; AVX512DQBW-FAST-NEXT:    vperm2i128 {{.*#+}} ymm12 = ymm14[2,3],ymm12[2,3]
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm12, %zmm7, %zmm7
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1}
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm13[0],ymm8[0],ymm13[2],ymm8[2]
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm8 = ymm9[0],ymm11[0],ymm9[2],ymm11[2]
-; AVX512DQBW-FAST-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm8[2,3],ymm1[2,3]
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm0, %zmm0
-; AVX512DQBW-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, 640(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm7, 704(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm5, 128(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm10, 192(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm4, 896(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, 960(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm6, 768(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, 832(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm17, 512(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm28, 576(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm18, 384(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm25, 448(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm24, 256(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm23, 320(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm22, (%rax)
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm21, 64(%rax)
-; AVX512DQBW-FAST-NEXT:    vzeroupper
-; AVX512DQBW-FAST-NEXT:    retq
+; AVX512BW-LABEL: store_i64_stride8_vf16:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %r11
+; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %zmm11
+; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm3
+; AVX512BW-NEXT:    vmovdqa64 64(%rsi), %zmm25
+; AVX512BW-NEXT:    vmovdqa64 (%rsi), %zmm19
+; AVX512BW-NEXT:    vmovdqa64 64(%rdx), %zmm12
+; AVX512BW-NEXT:    vmovdqa64 (%rdx), %zmm0
+; AVX512BW-NEXT:    vmovdqa64 64(%rcx), %zmm26
+; AVX512BW-NEXT:    vmovdqa64 (%rcx), %zmm17
+; AVX512BW-NEXT:    vmovdqa64 (%r8), %zmm6
+; AVX512BW-NEXT:    vmovdqa64 64(%r8), %zmm16
+; AVX512BW-NEXT:    vmovdqa64 (%r9), %zmm31
+; AVX512BW-NEXT:    vmovdqa64 64(%r9), %zmm27
+; AVX512BW-NEXT:    vmovdqa64 (%r11), %zmm8
+; AVX512BW-NEXT:    vmovdqa64 64(%r11), %zmm30
+; AVX512BW-NEXT:    vmovdqa64 (%r10), %zmm9
+; AVX512BW-NEXT:    vmovdqa64 64(%r10), %zmm29
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm18 = [1,9,1,9,1,9,1,9]
+; AVX512BW-NEXT:    # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm1
+; AVX512BW-NEXT:    vpermt2q %zmm9, %zmm18, %zmm1
+; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm4
+; AVX512BW-NEXT:    vpermt2q %zmm31, %zmm18, %zmm4
+; AVX512BW-NEXT:    movb $-64, %r8b
+; AVX512BW-NEXT:    kmovd %r8d, %k1
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
+; AVX512BW-NEXT:    vmovdqa (%rsi), %xmm1
+; AVX512BW-NEXT:    vinserti128 $1, (%rcx), %ymm1, %ymm1
+; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm10
+; AVX512BW-NEXT:    vinserti128 $1, (%rdx), %ymm10, %ymm10
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm13 = ymm10[1],ymm1[1],ymm10[3],ymm1[3]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm13, %zmm4, %zmm2
+; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm23 = [0,8,0,8,0,8,0,8]
+; AVX512BW-NEXT:    # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm4
+; AVX512BW-NEXT:    vpermt2q %zmm9, %zmm23, %zmm4
+; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm13
+; AVX512BW-NEXT:    vpermt2q %zmm31, %zmm23, %zmm13
+; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm13 {%k1}
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm10[0],ymm1[0],ymm10[2],ymm1[2]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm13, %zmm21
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm10 = [5,13,5,13,5,13,5,13]
+; AVX512BW-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm1
+; AVX512BW-NEXT:    vpermt2q %zmm9, %zmm10, %zmm1
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm4 = zmm6[1],zmm31[1],zmm6[3],zmm31[3],zmm6[5],zmm31[5],zmm6[7],zmm31[7]
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm1
+; AVX512BW-NEXT:    vpermt2q %zmm19, %zmm10, %zmm1
+; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm14 = [5,13,5,13]
+; AVX512BW-NEXT:    # ymm14 = mem[0,1,0,1]
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm13
+; AVX512BW-NEXT:    vpermt2q %zmm17, %zmm14, %zmm13
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm13[4,5,6,7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm1, %zmm4, %zmm22
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm15 = [4,12,4,12,4,12,4,12]
+; AVX512BW-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm1
+; AVX512BW-NEXT:    vpermt2q %zmm9, %zmm15, %zmm1
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm4 = zmm6[0],zmm31[0],zmm6[2],zmm31[2],zmm6[4],zmm31[4],zmm6[6],zmm31[6]
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm13
+; AVX512BW-NEXT:    vpermt2q %zmm19, %zmm15, %zmm13
+; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [4,12,4,12]
+; AVX512BW-NEXT:    # ymm1 = mem[0,1,0,1]
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm5
+; AVX512BW-NEXT:    vpermt2q %zmm17, %zmm1, %zmm5
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3],ymm5[4,5,6,7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm5, %zmm4, %zmm24
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm13 = [7,15,7,15,7,15,7,15]
+; AVX512BW-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm5
+; AVX512BW-NEXT:    vpermt2q %zmm31, %zmm13, %zmm5
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm8[1],zmm9[1],zmm8[3],zmm9[3],zmm8[5],zmm9[5],zmm8[7],zmm9[7]
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm2
+; AVX512BW-NEXT:    vpermt2q %zmm19, %zmm13, %zmm2
+; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [7,15,7,15]
+; AVX512BW-NEXT:    # ymm4 = mem[0,1,0,1]
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm7
+; AVX512BW-NEXT:    vpermt2q %zmm17, %zmm4, %zmm7
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm7[4,5,6,7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm2, %zmm5, %zmm20
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm28 = [6,14,6,14,6,14,6,14]
+; AVX512BW-NEXT:    # zmm28 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm2
+; AVX512BW-NEXT:    vpermt2q %zmm31, %zmm28, %zmm2
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm8[0],zmm9[0],zmm8[2],zmm9[2],zmm8[4],zmm9[4],zmm8[6],zmm9[6]
+; AVX512BW-NEXT:    vpermt2q %zmm19, %zmm28, %zmm3
+; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm7 = [6,14,6,14]
+; AVX512BW-NEXT:    # ymm7 = mem[0,1,0,1]
+; AVX512BW-NEXT:    vpermt2q %zmm17, %zmm7, %zmm0
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm17
+; AVX512BW-NEXT:    vmovdqa64 %zmm30, %zmm0
+; AVX512BW-NEXT:    vpermt2q %zmm29, %zmm18, %zmm0
+; AVX512BW-NEXT:    vpermi2q %zmm27, %zmm16, %zmm18
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm18 {%k1}
+; AVX512BW-NEXT:    vmovdqa 64(%rsi), %xmm0
+; AVX512BW-NEXT:    vinserti128 $1, 64(%rcx), %ymm0, %ymm0
+; AVX512BW-NEXT:    vmovdqa 64(%rdi), %xmm2
+; AVX512BW-NEXT:    vinserti128 $1, 64(%rdx), %ymm2, %ymm2
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm2[1],ymm0[1],ymm2[3],ymm0[3]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm3, %zmm18, %zmm18
+; AVX512BW-NEXT:    vmovdqa64 %zmm30, %zmm3
+; AVX512BW-NEXT:    vpermt2q %zmm29, %zmm23, %zmm3
+; AVX512BW-NEXT:    vpermi2q %zmm27, %zmm16, %zmm23
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm23 {%k1}
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[2],ymm0[2]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm23, %zmm19
+; AVX512BW-NEXT:    vmovdqa64 %zmm30, %zmm0
+; AVX512BW-NEXT:    vpermt2q %zmm29, %zmm10, %zmm0
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm2 = zmm16[1],zmm27[1],zmm16[3],zmm27[3],zmm16[5],zmm27[5],zmm16[7],zmm27[7]
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k1}
+; AVX512BW-NEXT:    vpermi2q %zmm25, %zmm11, %zmm10
+; AVX512BW-NEXT:    vpermi2q %zmm26, %zmm12, %zmm14
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm14[4,5,6,7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm2
+; AVX512BW-NEXT:    vmovdqa64 %zmm30, %zmm0
+; AVX512BW-NEXT:    vpermt2q %zmm29, %zmm15, %zmm0
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm16[0],zmm27[0],zmm16[2],zmm27[2],zmm16[4],zmm27[4],zmm16[6],zmm27[6]
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm3 {%k1}
+; AVX512BW-NEXT:    vpermi2q %zmm25, %zmm11, %zmm15
+; AVX512BW-NEXT:    vpermi2q %zmm26, %zmm12, %zmm1
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm0, %zmm3, %zmm1
+; AVX512BW-NEXT:    vmovdqa64 %zmm16, %zmm0
+; AVX512BW-NEXT:    vpermt2q %zmm27, %zmm13, %zmm0
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm30[1],zmm29[1],zmm30[3],zmm29[3],zmm30[5],zmm29[5],zmm30[7],zmm29[7]
+; AVX512BW-NEXT:    vpermi2q %zmm25, %zmm11, %zmm13
+; AVX512BW-NEXT:    vpermi2q %zmm26, %zmm12, %zmm4
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3],ymm4[4,5,6,7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm3, %zmm0, %zmm4
+; AVX512BW-NEXT:    vmovdqa64 %zmm16, %zmm0
+; AVX512BW-NEXT:    vpermt2q %zmm27, %zmm28, %zmm0
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm30[0],zmm29[0],zmm30[2],zmm29[2],zmm30[4],zmm29[4],zmm30[6],zmm29[6]
+; AVX512BW-NEXT:    vpermt2q %zmm25, %zmm28, %zmm11
+; AVX512BW-NEXT:    vpermt2q %zmm26, %zmm7, %zmm12
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm3 = ymm11[0,1,2,3],ymm12[4,5,6,7]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm3, %zmm0, %zmm0
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [3,11,3,11,3,11,3,11]
+; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm7
+; AVX512BW-NEXT:    vpermt2q %zmm9, %zmm3, %zmm7
+; AVX512BW-NEXT:    vmovdqa64 %zmm6, %zmm10
+; AVX512BW-NEXT:    vpermt2q %zmm31, %zmm3, %zmm10
+; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm10 {%k1}
+; AVX512BW-NEXT:    vmovdqa (%rcx), %ymm7
+; AVX512BW-NEXT:    vmovdqa 64(%rcx), %ymm11
+; AVX512BW-NEXT:    vmovdqa (%rdx), %ymm12
+; AVX512BW-NEXT:    vmovdqa 64(%rdx), %ymm13
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm14 = ymm12[1],ymm7[1],ymm12[3],ymm7[3]
+; AVX512BW-NEXT:    vmovdqa (%rsi), %ymm15
+; AVX512BW-NEXT:    vmovdqa64 64(%rsi), %ymm23
+; AVX512BW-NEXT:    vmovdqa64 (%rdi), %ymm25
+; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %ymm26
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm5 = ymm25[1],ymm15[1],ymm25[3],ymm15[3]
+; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm5 = ymm5[2,3],ymm14[2,3]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm5, %zmm10, %zmm5
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm10 = [2,10,2,10,2,10,2,10]
+; AVX512BW-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vpermt2q %zmm9, %zmm10, %zmm8
+; AVX512BW-NEXT:    vpermt2q %zmm31, %zmm10, %zmm6
+; AVX512BW-NEXT:    vmovdqa64 %zmm8, %zmm6 {%k1}
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm7 = ymm12[0],ymm7[0],ymm12[2],ymm7[2]
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm8 = ymm25[0],ymm15[0],ymm25[2],ymm15[2]
+; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm7 = ymm8[2,3],ymm7[2,3]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm7, %zmm6, %zmm6
+; AVX512BW-NEXT:    vmovdqa64 %zmm30, %zmm7
+; AVX512BW-NEXT:    vpermt2q %zmm29, %zmm3, %zmm7
+; AVX512BW-NEXT:    vpermi2q %zmm27, %zmm16, %zmm3
+; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm3 {%k1}
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm7 = ymm13[1],ymm11[1],ymm13[3],ymm11[3]
+; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} ymm8 = ymm26[1],ymm23[1],ymm26[3],ymm23[3]
+; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm7 = ymm8[2,3],ymm7[2,3]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm7, %zmm3, %zmm3
+; AVX512BW-NEXT:    vpermt2q %zmm29, %zmm10, %zmm30
+; AVX512BW-NEXT:    vpermt2q %zmm27, %zmm10, %zmm16
+; AVX512BW-NEXT:    vmovdqa64 %zmm30, %zmm16 {%k1}
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm7 = ymm13[0],ymm11[0],ymm13[2],ymm11[2]
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm8 = ymm26[0],ymm23[0],ymm26[2],ymm23[2]
+; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm7 = ymm8[2,3],ymm7[2,3]
+; AVX512BW-NEXT:    vinserti64x4 $0, %ymm7, %zmm16, %zmm7
+; AVX512BW-NEXT:    vmovdqa64 %zmm7, 640(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, 704(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm6, 128(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm5, 192(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, 896(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm4, 960(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, 768(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, 832(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm19, 512(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm18, 576(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm17, 384(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm20, 448(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm24, 256(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm22, 320(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm21, (%rax)
+; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT:    vmovaps %zmm0, 64(%rax)
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
   %in.vec0 = load <16 x i64>, ptr %in.vecptr0, align 64
   %in.vec1 = load <16 x i64>, ptr %in.vecptr1, align 64
   %in.vec2 = load <16 x i64>, ptr %in.vecptr2, align 64
@@ -5450,7 +4144,7 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ;
 ; AVX512F-ONLY-SLOW-LABEL: store_i64_stride8_vf32:
 ; AVX512F-ONLY-SLOW:       # %bb.0:
-; AVX512F-ONLY-SLOW-NEXT:    subq $2632, %rsp # imm = 0xA48
+; AVX512F-ONLY-SLOW-NEXT:    subq $2568, %rsp # imm = 0xA08
 ; AVX512F-ONLY-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512F-ONLY-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %r10
 ; AVX512F-ONLY-SLOW-NEXT:    vmovaps 128(%rdi), %zmm0
@@ -5458,75 +4152,75 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 64(%rdi), %zmm6
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm0
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 128(%rsi), %zmm23
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 64(%rsi), %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 64(%rsi), %zmm16
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%rsi), %zmm3
-; AVX512F-ONLY-SLOW-NEXT:    vmovaps 192(%rdx), %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vmovups %zmm2, (%rsp) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovaps 128(%rdx), %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps 192(%rdx), %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps 128(%rdx), %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovups %zmm1, (%rsp) # 64-byte Spill
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 64(%rdx), %zmm8
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%rdx), %zmm2
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 64(%rcx), %zmm5
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%rcx), %zmm4
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%r8), %zmm22
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%r8), %zmm20
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 64(%r8), %zmm19
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%r9), %zmm29
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 64(%r9), %zmm25
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%r10), %zmm18
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 64(%r10), %zmm17
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%r9), %zmm30
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 64(%r9), %zmm15
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%r10), %zmm17
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 64(%r10), %zmm18
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%rax), %zmm21
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 64(%rax), %zmm20
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 64(%rax), %zmm22
 ; AVX512F-ONLY-SLOW-NEXT:    movb $-64, %r11b
 ; AVX512F-ONLY-SLOW-NEXT:    kmovw %r11d, %k1
-; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm12 = [4,12,4,12,4,12,4,12]
-; AVX512F-ONLY-SLOW-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm7
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm21, %zmm12, %zmm7
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm9 = zmm22[0],zmm29[0],zmm22[2],zmm29[2],zmm22[4],zmm29[4],zmm22[6],zmm29[6]
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm13 = [4,12,4,12,4,12,4,12]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm7
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm21, %zmm13, %zmm7
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm9 = zmm20[0],zmm30[0],zmm20[2],zmm30[2],zmm20[4],zmm30[4],zmm20[6],zmm30[6]
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm9 {%k1}
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm7
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm12, %zmm7
-; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm11 = [4,12,4,12]
-; AVX512F-ONLY-SLOW-NEXT:    # ymm11 = mem[0,1,0,1]
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm13, %zmm7
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [4,12,4,12]
+; AVX512F-ONLY-SLOW-NEXT:    # ymm1 = mem[0,1,0,1]
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm10
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm11, %zmm10
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm1, %zmm10
 ; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7]
 ; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm7, %zmm9, %zmm7
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm14 = [5,13,5,13,5,13,5,13]
 ; AVX512F-ONLY-SLOW-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm7
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm7
 ; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm21, %zmm14, %zmm7
-; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm9 = zmm22[1],zmm29[1],zmm22[3],zmm29[3],zmm22[5],zmm29[5],zmm22[7],zmm29[7]
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm9 = zmm20[1],zmm30[1],zmm20[3],zmm30[3],zmm20[5],zmm30[5],zmm20[7],zmm30[7]
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm9 {%k1}
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm7
 ; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm14, %zmm7
-; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm13 = [5,13,5,13]
-; AVX512F-ONLY-SLOW-NEXT:    # ymm13 = mem[0,1,0,1]
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm11 = [5,13,5,13]
+; AVX512F-ONLY-SLOW-NEXT:    # ymm11 = mem[0,1,0,1]
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm10
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm13, %zmm10
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm11, %zmm10
 ; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7]
 ; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm7, %zmm9, %zmm7
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm15 = [6,14,6,14,6,14,6,14]
-; AVX512F-ONLY-SLOW-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm22, %zmm7
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm29, %zmm15, %zmm7
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm18[0],zmm21[0],zmm18[2],zmm21[2],zmm18[4],zmm21[4],zmm18[6],zmm21[6]
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm12 = [6,14,6,14,6,14,6,14]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm7
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm30, %zmm12, %zmm7
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm17[0],zmm21[0],zmm17[2],zmm21[2],zmm17[4],zmm21[4],zmm17[6],zmm21[6]
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm9
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm15, %zmm9
-; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm11 = [6,14,6,14]
-; AVX512F-ONLY-SLOW-NEXT:    # ymm11 = mem[0,1,0,1]
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm12, %zmm9
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14]
+; AVX512F-ONLY-SLOW-NEXT:    # ymm1 = mem[0,1,0,1]
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm10
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm11, %zmm10
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm1, %zmm10
 ; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7]
 ; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm9, %zmm7, %zmm7
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm26 = [7,15,7,15,7,15,7,15]
 ; AVX512F-ONLY-SLOW-NEXT:    # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm22, %zmm7
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm29, %zmm26, %zmm7
-; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm7 {%k1} = zmm18[1],zmm21[1],zmm18[3],zmm21[3],zmm18[5],zmm21[5],zmm18[7],zmm21[7]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm7
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm30, %zmm26, %zmm7
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm7 {%k1} = zmm17[1],zmm21[1],zmm17[3],zmm21[3],zmm17[5],zmm21[5],zmm17[7],zmm21[7]
 ; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm26, %zmm0
 ; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} ymm28 = [7,15,7,15]
 ; AVX512F-ONLY-SLOW-NEXT:    # ymm28 = mem[0,1,2,3,0,1,2,3]
@@ -5534,12 +4228,12 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
 ; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm7, %zmm0
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm20, %zmm12, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm2 = zmm19[0],zmm25[0],zmm19[2],zmm25[2],zmm19[4],zmm25[4],zmm19[6],zmm25[6]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm22, %zmm13, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm2 = zmm19[0],zmm15[0],zmm19[2],zmm15[2],zmm19[4],zmm15[4],zmm19[6],zmm15[6]
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k1}
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm12, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm16, %zmm13, %zmm0
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm3
 ; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [4,12,4,12]
 ; AVX512F-ONLY-SLOW-NEXT:    # ymm4 = mem[0,1,0,1]
@@ -5547,383 +4241,366 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
 ; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm0
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm20, %zmm14, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm2 = zmm19[1],zmm25[1],zmm19[3],zmm25[3],zmm19[5],zmm25[5],zmm19[7],zmm25[7]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm22, %zmm14, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm2 = zmm19[1],zmm15[1],zmm19[3],zmm15[3],zmm19[5],zmm15[5],zmm19[7],zmm15[7]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm19, %zmm9
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k1}
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm14, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm16, %zmm14, %zmm0
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm3
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm7
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm13, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm7
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm11, %zmm3
 ; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
 ; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm0
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm15, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm16, %zmm12, %zmm0
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm11, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm1, %zmm2
 ; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm19, %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm25, %zmm15, %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm17[0],zmm20[0],zmm17[2],zmm20[2],zmm17[4],zmm20[4],zmm17[6],zmm20[6]
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm15, %zmm12, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm18[0],zmm22[0],zmm18[2],zmm22[2],zmm18[4],zmm22[4],zmm18[6],zmm22[6]
 ; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm0
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 128(%rcx), %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm26, %zmm6
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm16, %zmm26, %zmm6
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm19, %zmm1
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm25, %zmm26, %zmm1
-; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm17[1],zmm20[1],zmm17[3],zmm20[3],zmm17[5],zmm20[5],zmm17[7],zmm20[7]
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm15, %zmm26, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm18[1],zmm22[1],zmm18[3],zmm22[3],zmm18[5],zmm22[5],zmm18[7],zmm22[7]
 ; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm28, %zmm8
 ; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm8[4,5,6,7]
 ; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm1, %zmm1
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 (%rsp), %zmm11 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm1
 ; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm4, %zmm1
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm1
 ; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm7, %zmm1
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm1
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm11, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm3, %zmm1
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm28, %zmm13
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm28, %zmm11
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm11, (%rsp) # 64-byte Spill
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 192(%rcx), %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 (%rsp), %zmm13 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm0, %zmm13, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm0, %zmm11, %zmm4
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm0, %zmm13, %zmm7
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm0, %zmm11, %zmm7
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm0, %zmm13, %zmm11
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm0, %zmm11, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm28, %zmm11
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm28, %zmm13
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm13, (%rsp) # 64-byte Spill
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm23, %zmm12, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm23, %zmm13, %zmm1
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1
 ; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm23, %zmm14, %zmm1
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm23, %zmm15, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm23, %zmm12, %zmm1
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm23, %zmm26, %zmm0
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 128(%r10), %zmm28
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 128(%rax), %zmm4
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm12, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm13, %zmm0
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm0
 ; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm14, %zmm0
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 128(%r8), %zmm16
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 128(%r9), %zmm13
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm13, %zmm15, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm9
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 128(%r9), %zmm11
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm24
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm11, %zmm12, %zmm24
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 192(%r10), %zmm23
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 192(%rax), %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 192(%rax), %zmm5
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm12, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm13, %zmm0
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 192(%rdi), %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 192(%rdi), %zmm1
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 192(%rsi), %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm0, %zmm2, %zmm12
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm3
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm14, %zmm3
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm0, %zmm2, %zmm14
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm0, %zmm1, %zmm13
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm14, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm0, %zmm1, %zmm14
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 192(%r8), %zmm31
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 192(%r9), %zmm3
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm5
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm15, %zmm5
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm10
-; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm0, %zmm2, %zmm15
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm26, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm29
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm12, %zmm29
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm0, %zmm1, %zmm12
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm26, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm19
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm11, %zmm26, %zmm19
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm27
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm26, %zmm27
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,0,8,0,8,0,8]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm21, %zmm0, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [1,9,1,9,1,9,1,9]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm21, %zmm1, %zmm2
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm27
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm13, %zmm26, %zmm27
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm26, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm5 = [0,8,0,8,0,8,0,8]
-; AVX512F-ONLY-SLOW-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm30
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm21, %zmm5, %zmm30
-; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm6 = [1,9,1,9,1,9,1,9]
-; AVX512F-ONLY-SLOW-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm21, %zmm6, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm26 = [2,10,2,10,2,10,2,10]
 ; AVX512F-ONLY-SLOW-NEXT:    # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm21, %zmm26, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11]
-; AVX512F-ONLY-SLOW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm21, %zmm0, %zmm18
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm22, %zmm7
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm22, %zmm24
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm29, %zmm5, %zmm24
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm29, %zmm6, %zmm22
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm21
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm29, %zmm26, %zmm21
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm29, %zmm0, %zmm7
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm29
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm20, %zmm5, %zmm29
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm7
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm20, %zmm6, %zmm7
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm7
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm20, %zmm26, %zmm7
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm20, %zmm0, %zmm17
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm21, %zmm26, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,3,11,3,11,3,11]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm21, %zmm2, %zmm17
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm19, %zmm7
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm25, %zmm5, %zmm7
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm19, %zmm8
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm25, %zmm6, %zmm8
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm19, %zmm20
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm25, %zmm26, %zmm20
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm25, %zmm0, %zmm19
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm19
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm5, %zmm19
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm25
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm6, %zmm25
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm28[0],zmm4[0],zmm28[2],zmm4[2],zmm28[4],zmm4[4],zmm28[6],zmm4[6]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm28[1],zmm4[1],zmm28[3],zmm4[3],zmm28[5],zmm4[5],zmm28[7],zmm4[7]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm9
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm26, %zmm9
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm0, %zmm28
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm9
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm13, %zmm5, %zmm9
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm25
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm30, %zmm0, %zmm25
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm17
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm30, %zmm1, %zmm17
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm21
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm30, %zmm26, %zmm21
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm30, %zmm2, %zmm20
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm20
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm22, %zmm0, %zmm20
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm30
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm22, %zmm1, %zmm30
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm6
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm22, %zmm26, %zmm6
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm22, %zmm2, %zmm18
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm8
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm6
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm15, %zmm0, %zmm6
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm7
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm15, %zmm1, %zmm7
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm22
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm15, %zmm26, %zmm22
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm15, %zmm2, %zmm8
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm14
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm0, %zmm14
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm18
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm1, %zmm18
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm24 {%k1} = zmm28[0],zmm4[0],zmm28[2],zmm4[2],zmm28[4],zmm4[4],zmm28[6],zmm4[6]
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm19 {%k1} = zmm28[1],zmm4[1],zmm28[3],zmm4[3],zmm28[5],zmm4[5],zmm28[7],zmm4[7]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm8
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm26, %zmm8
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm2, %zmm28
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm8
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm11, %zmm0, %zmm8
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm15
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm13, %zmm6, %zmm15
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm12
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm13, %zmm26, %zmm12
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm14 = zmm16[0],zmm13[0],zmm16[2],zmm13[2],zmm16[4],zmm13[4],zmm16[6],zmm13[6]
-; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm17 = zmm16[1],zmm13[1],zmm16[3],zmm13[3],zmm16[5],zmm13[5],zmm16[7],zmm13[7]
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm13, %zmm0, %zmm16
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm11, %zmm1, %zmm15
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm9
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm11, %zmm26, %zmm9
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm10 = zmm16[0],zmm11[0],zmm16[2],zmm11[2],zmm16[4],zmm11[4],zmm16[6],zmm11[6]
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm12 = zmm16[1],zmm11[1],zmm16[3],zmm11[3],zmm16[5],zmm11[5],zmm16[7],zmm11[7]
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm11, %zmm2, %zmm16
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm11
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm0, %zmm11
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm13
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm5, %zmm13
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm18
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm6, %zmm18
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm10 {%k1} = zmm23[0],zmm1[0],zmm23[2],zmm1[2],zmm23[4],zmm1[4],zmm23[6],zmm1[6]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm23[1],zmm1[1],zmm23[3],zmm1[3],zmm23[5],zmm1[5],zmm23[7],zmm1[7]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm26, %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm0, %zmm23
-; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm3, %zmm31, %zmm5
-; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm3, %zmm31, %zmm6
-; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm3, %zmm31, %zmm26
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm10 = zmm31[0],zmm3[0],zmm31[2],zmm3[2],zmm31[4],zmm3[4],zmm31[6],zmm3[6]
-; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm11 = zmm31[1],zmm3[1],zmm31[3],zmm3[3],zmm31[5],zmm3[5],zmm31[7],zmm3[7]
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm0, %zmm31
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm30, %zmm24 {%k1}
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa (%rcx), %xmm3
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa (%rdx), %xmm2
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm2[0],xmm3[0]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa (%rsi), %xmm1
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm30 = xmm0[0],xmm1[0]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti32x4 $1, %xmm4, %ymm30, %ymm4
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm24, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm1, %zmm13
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm29 {%k1} = zmm23[0],zmm5[0],zmm23[2],zmm5[2],zmm23[4],zmm5[4],zmm23[6],zmm5[6]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm23[1],zmm5[1],zmm23[3],zmm5[3],zmm23[5],zmm5[5],zmm23[7],zmm5[7]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm26, %zmm4
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm22 {%k1}
-; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1]
-; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm22, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm29, %zmm7 {%k1}
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 64(%rcx), %xmm0
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 64(%rdx), %xmm1
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 64(%rsi), %xmm4
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 64(%rdi), %xmm29
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm22 = xmm29[0],xmm4[0]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti32x4 $1, %xmm2, %ymm22, %ymm2
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm2, %zmm23
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm3, %zmm31, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm3, %zmm31, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm3, %zmm31, %zmm26
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm5 = zmm31[0],zmm3[0],zmm31[2],zmm3[2],zmm31[4],zmm3[4],zmm31[6],zmm3[6]
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm4 = zmm31[1],zmm3[1],zmm31[3],zmm3[3],zmm31[5],zmm3[5],zmm31[7],zmm3[7]
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm2, %zmm31
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm25 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa (%rsi), %xmm2
+; AVX512F-ONLY-SLOW-NEXT:    vinserti128 $1, (%rcx), %ymm2, %ymm3
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa (%rdi), %xmm2
+; AVX512F-ONLY-SLOW-NEXT:    vinserti128 $1, (%rdx), %ymm2, %ymm2
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm29 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm29, %zmm25, %zmm25
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm25, %zmm17 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm17, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm6 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 64(%rsi), %xmm2
+; AVX512F-ONLY-SLOW-NEXT:    vinserti128 $1, 64(%rcx), %ymm2, %ymm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 64(%rdi), %xmm17
+; AVX512F-ONLY-SLOW-NEXT:    vinserti32x4 $1, 64(%rdx), %ymm17, %ymm17
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm20 = ymm17[0],ymm2[0],ymm17[2],ymm2[2]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm20, %zmm6, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm30, %zmm7 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm17[1],ymm2[1],ymm17[3],ymm2[3]
 ; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm7, %zmm2
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm8 {%k1}
-; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm1 = xmm29[1],xmm4[1]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm8, %zmm30
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm19, %zmm9 {%k1}
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 128(%rcx), %xmm1
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 128(%rdx), %xmm2
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm2[0],xmm1[0]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm14, %zmm8 {%k1}
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 128(%rsi), %xmm7
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 128(%rdi), %xmm8
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm19 = xmm8[0],xmm7[0]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti32x4 $1, %xmm4, %ymm19, %ymm4
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm9, %zmm29
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm25, %zmm15 {%k1}
-; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm1 = xmm2[1],xmm1[1]
-; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm8[1],xmm7[1]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm15, %zmm24
+; AVX512F-ONLY-SLOW-NEXT:    vinserti128 $1, 128(%rcx), %ymm7, %ymm7
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 128(%rdi), %xmm14
+; AVX512F-ONLY-SLOW-NEXT:    vinserti128 $1, 128(%rdx), %ymm14, %ymm14
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm17 = ymm14[0],ymm7[0],ymm14[2],ymm7[2]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm17, %zmm8, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm15 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm7 = ymm14[1],ymm7[1],ymm14[3],ymm7[3]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm7, %zmm15, %zmm30
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm10 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm14 # 32-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT:    # ymm14 = ymm2[0,1,2,3],mem[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm14, %zmm10, %zmm29
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm12 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm14 # 32-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT:    # ymm14 = ymm2[0,1,2,3],mem[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm14, %zmm12, %zmm25
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm14 # 32-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT:    # ymm14 = ymm2[0,1,2,3],mem[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm14, %zmm24, %zmm24
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd $240, (%rsp), %ymm2, %ymm15 # 32-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT:    # ymm15 = ymm2[0,1,2,3],mem[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm15, %zmm19, %zmm15
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm0 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 192(%rsi), %xmm11
+; AVX512F-ONLY-SLOW-NEXT:    vinserti128 $1, 192(%rcx), %ymm11, %ymm11
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 192(%rdi), %xmm17
+; AVX512F-ONLY-SLOW-NEXT:    vinserti32x4 $1, 192(%rdx), %ymm17, %ymm17
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm18 = ymm17[0],ymm11[0],ymm17[2],ymm11[2]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm18, %zmm0, %zmm7
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm1 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm11 = ymm17[1],ymm11[1],ymm17[3],ymm11[3]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm11, %zmm1, %zmm1
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm14 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm5 {%k1}
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
-; AVX512F-ONLY-SLOW-NEXT:    # ymm2 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm14, %zmm22
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT:    # ymm11 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm11, %zmm5, %zmm5
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm17 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm4 {%k1}
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload
-; AVX512F-ONLY-SLOW-NEXT:    # ymm7 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm7, %zmm17, %zmm7
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT:    # ymm11 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm11, %zmm4, %zmm4
 ; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload
-; AVX512F-ONLY-SLOW-NEXT:    # ymm8 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT:    # ymm11 = ymm0[0,1,2,3],mem[4,5,6,7]
 ; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vinsertf64x4 $0, %ymm8, %zmm0, %zmm8
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload
-; AVX512F-ONLY-SLOW-NEXT:    # ymm9 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm9, %zmm27, %zmm27
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm5 {%k1}
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 192(%rcx), %xmm13
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 192(%rdx), %xmm14
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm15 = xmm14[0],xmm13[0]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 192(%rsi), %xmm17
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 192(%rdi), %xmm19
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm25 = xmm19[0],xmm17[0]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti32x4 $1, %xmm15, %ymm25, %ymm15
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm15, %zmm5, %zmm25
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm6 {%k1}
-; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm13 = xmm14[1],xmm13[1]
-; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm14 = xmm19[1],xmm17[1]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm13, %ymm14, %ymm13
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm13, %zmm6, %zmm6
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm10 {%k1}
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload
-; AVX512F-ONLY-SLOW-NEXT:    # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm13, %zmm10, %zmm10
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm11 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vinsertf64x4 $0, %ymm11, %zmm0, %zmm11
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512F-ONLY-SLOW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload
 ; AVX512F-ONLY-SLOW-NEXT:    # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm13, %zmm11, %zmm11
-; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload
-; AVX512F-ONLY-SLOW-NEXT:    # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vinsertf64x4 $0, %ymm13, %zmm0, %zmm13
-; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vblendps $240, (%rsp), %ymm0, %ymm14 # 32-byte Folded Reload
-; AVX512F-ONLY-SLOW-NEXT:    # ymm14 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vinsertf64x4 $0, %ymm14, %zmm0, %zmm14
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm13, %zmm27, %zmm27
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm21 {%k1}
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa (%rcx), %ymm15
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%rdx), %ymm17
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm17[0],ymm15[0],ymm17[2],ymm15[2]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%rsi), %ymm18
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%rdi), %ymm19
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm19[0],ymm18[0],ymm19[2],ymm18[2]
-; AVX512F-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm21, %zmm21
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%rcx), %ymm17
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%rdx), %ymm18
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm18[0],ymm17[0],ymm18[2],ymm17[2]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%rsi), %ymm19
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%rdi), %ymm20
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm6 = ymm20[0],ymm19[0],ymm20[2],ymm19[2]
+; AVX512F-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm6[2,3],ymm3[2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm21, %zmm3
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm17[1],ymm15[1],ymm17[3],ymm15[3]
-; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm15 = ymm19[1],ymm18[1],ymm19[3],ymm18[3]
-; AVX512F-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm15[2,3],ymm3[2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm1, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm8 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm6 = ymm18[1],ymm17[1],ymm18[3],ymm17[3]
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm20[1],ymm19[1],ymm20[3],ymm19[3]
+; AVX512F-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm2[2,3],ymm6[2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm8, %zmm2
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm20 {%k1}
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 64(%rsi), %ymm15
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 64(%rdi), %ymm17
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 64(%rcx), %ymm18
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 64(%rdx), %ymm19
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm9 = ymm19[0],ymm18[0],ymm19[2],ymm18[2]
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm4 = ymm17[0],ymm15[0],ymm17[2],ymm15[2]
-; AVX512F-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm4 = ymm4[2,3],ymm9[2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm20, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm22 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 64(%rcx), %ymm6
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 64(%rdx), %ymm17
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 64(%rsi), %ymm18
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 64(%rdi), %ymm19
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm14 = ymm17[0],ymm6[0],ymm17[2],ymm6[2]
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm8 = ymm19[0],ymm18[0],ymm19[2],ymm18[2]
+; AVX512F-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm8 = ymm8[2,3],ymm14[2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm8, %zmm22, %zmm8
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm9 = ymm19[1],ymm18[1],ymm19[3],ymm18[3]
-; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm15 = ymm17[1],ymm15[1],ymm17[3],ymm15[3]
-; AVX512F-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm9 = ymm15[2,3],ymm9[2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm9, %zmm1, %zmm9
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm10 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm6 = ymm17[1],ymm6[1],ymm17[3],ymm6[3]
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm14 = ymm19[1],ymm18[1],ymm19[3],ymm18[3]
+; AVX512F-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm6 = ymm14[2,3],ymm6[2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm6, %zmm10, %zmm6
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm12 {%k1}
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 128(%rcx), %ymm15
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm9 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 128(%rcx), %ymm14
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 128(%rdx), %ymm17
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm5 = ymm17[0],ymm15[0],ymm17[2],ymm15[2]
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm17[0],ymm14[0],ymm17[2],ymm14[2]
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 128(%rsi), %ymm18
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 128(%rdi), %ymm19
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm19[0],ymm18[0],ymm19[2],ymm18[2]
-; AVX512F-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm5[2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm12, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm10 = ymm19[0],ymm18[0],ymm19[2],ymm18[2]
+; AVX512F-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm10[2,3],ymm0[2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm9, %zmm0
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm16 {%k1}
-; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm5 = ymm17[1],ymm15[1],ymm17[3],ymm15[3]
-; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm12 = ymm19[1],ymm18[1],ymm19[3],ymm18[3]
-; AVX512F-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm5 = ymm12[2,3],ymm5[2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm5, %zmm16, %zmm5
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm26 {%k1}
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 192(%rcx), %ymm12
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 192(%rdx), %ymm15
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm15[0],ymm12[0],ymm15[2],ymm12[2]
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm9 = ymm17[1],ymm14[1],ymm17[3],ymm14[3]
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm10 = ymm19[1],ymm18[1],ymm19[3],ymm18[3]
+; AVX512F-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm9 = ymm10[2,3],ymm9[2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm9, %zmm16, %zmm9
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm26 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 192(%rcx), %ymm10
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 192(%rdx), %ymm14
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm13 = ymm14[0],ymm10[0],ymm14[2],ymm10[2]
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 192(%rsi), %ymm16
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 192(%rdi), %ymm17
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm2 = ymm17[0],ymm16[0],ymm17[2],ymm16[2]
-; AVX512F-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm26, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm12 = ymm17[0],ymm16[0],ymm17[2],ymm16[2]
+; AVX512F-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm12 = ymm12[2,3],ymm13[2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm12, %zmm26, %zmm12
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm31 {%k1}
-; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm15[1],ymm12[1],ymm15[3],ymm12[3]
-; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm12 = ymm17[1],ymm16[1],ymm17[3],ymm16[3]
-; AVX512F-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm12[2,3],ymm2[2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm31, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm10 = ymm14[1],ymm10[1],ymm14[3],ymm10[3]
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm13 = ymm17[1],ymm16[1],ymm17[3],ymm16[3]
+; AVX512F-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm10 = ymm13[2,3],ymm10[2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm10, %zmm31, %zmm10
 ; AVX512F-ONLY-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, 1728(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, 1664(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, 1216(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, 1152(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, 704(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, 640(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, 192(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm21, 128(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm14, 1984(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm13, 1920(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm11, 1856(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm10, 1792(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, 1600(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm25, 1536(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm27, 1472(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm8, 1408(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, 1344(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm22, 1280(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm24, 1088(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm29, 1024(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm10, 1728(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm12, 1664(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, 1216(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, 1152(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, 704(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, 640(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, 192(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, 128(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm27, 1984(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm11, 1920(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, 1856(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, 1792(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, 1600(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, 1536(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm15, 1472(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm24, 1408(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm25, 1344(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm29, 1280(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm30, 1088(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 1024(%rax)
 ; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 960(%rax)
 ; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
@@ -5932,7 +4609,8 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 832(%rax)
 ; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 768(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm30, 576(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 576(%rax)
 ; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 512(%rax)
 ; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
@@ -5947,13 +4625,13 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 64(%rax)
 ; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, (%rax)
-; AVX512F-ONLY-SLOW-NEXT:    addq $2632, %rsp # imm = 0xA48
+; AVX512F-ONLY-SLOW-NEXT:    addq $2568, %rsp # imm = 0xA08
 ; AVX512F-ONLY-SLOW-NEXT:    vzeroupper
 ; AVX512F-ONLY-SLOW-NEXT:    retq
 ;
 ; AVX512F-ONLY-FAST-LABEL: store_i64_stride8_vf32:
 ; AVX512F-ONLY-FAST:       # %bb.0:
-; AVX512F-ONLY-FAST-NEXT:    subq $2632, %rsp # imm = 0xA48
+; AVX512F-ONLY-FAST-NEXT:    subq $2568, %rsp # imm = 0xA08
 ; AVX512F-ONLY-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512F-ONLY-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %r10
 ; AVX512F-ONLY-FAST-NEXT:    vmovaps 128(%rdi), %zmm0
@@ -5961,75 +4639,75 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%rdi), %zmm6
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%rdi), %zmm0
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 128(%rsi), %zmm23
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%rsi), %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%rsi), %zmm16
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%rsi), %zmm3
-; AVX512F-ONLY-FAST-NEXT:    vmovaps 192(%rdx), %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vmovups %zmm2, (%rsp) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovaps 128(%rdx), %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovaps 192(%rdx), %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovaps 128(%rdx), %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovups %zmm1, (%rsp) # 64-byte Spill
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%rdx), %zmm8
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%rdx), %zmm2
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%rcx), %zmm5
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%rcx), %zmm4
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%r8), %zmm22
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%r8), %zmm20
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%r8), %zmm19
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%r9), %zmm29
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%r9), %zmm25
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%r10), %zmm18
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%r10), %zmm17
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%r9), %zmm30
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%r9), %zmm15
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%r10), %zmm17
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%r10), %zmm18
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%rax), %zmm21
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%rax), %zmm20
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%rax), %zmm22
 ; AVX512F-ONLY-FAST-NEXT:    movb $-64, %r11b
 ; AVX512F-ONLY-FAST-NEXT:    kmovw %r11d, %k1
-; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm12 = [4,12,4,12,4,12,4,12]
-; AVX512F-ONLY-FAST-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm18, %zmm7
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm21, %zmm12, %zmm7
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm9 = zmm22[0],zmm29[0],zmm22[2],zmm29[2],zmm22[4],zmm29[4],zmm22[6],zmm29[6]
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm13 = [4,12,4,12,4,12,4,12]
+; AVX512F-ONLY-FAST-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm17, %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm21, %zmm13, %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm9 = zmm20[0],zmm30[0],zmm20[2],zmm30[2],zmm20[4],zmm30[4],zmm20[6],zmm30[6]
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, %zmm9 {%k1}
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm7
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm12, %zmm7
-; AVX512F-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm11 = [4,12,4,12]
-; AVX512F-ONLY-FAST-NEXT:    # ymm11 = mem[0,1,0,1]
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm13, %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [4,12,4,12]
+; AVX512F-ONLY-FAST-NEXT:    # ymm1 = mem[0,1,0,1]
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm10
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm11, %zmm10
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm1, %zmm10
 ; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7]
 ; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm7, %zmm9, %zmm7
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm14 = [5,13,5,13,5,13,5,13]
 ; AVX512F-ONLY-FAST-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm18, %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm17, %zmm7
 ; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm21, %zmm14, %zmm7
-; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm9 = zmm22[1],zmm29[1],zmm22[3],zmm29[3],zmm22[5],zmm29[5],zmm22[7],zmm29[7]
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm9 = zmm20[1],zmm30[1],zmm20[3],zmm30[3],zmm20[5],zmm30[5],zmm20[7],zmm30[7]
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, %zmm9 {%k1}
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm7
 ; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm14, %zmm7
-; AVX512F-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm13 = [5,13,5,13]
-; AVX512F-ONLY-FAST-NEXT:    # ymm13 = mem[0,1,0,1]
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm11 = [5,13,5,13]
+; AVX512F-ONLY-FAST-NEXT:    # ymm11 = mem[0,1,0,1]
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm10
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm13, %zmm10
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm11, %zmm10
 ; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7]
 ; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm7, %zmm9, %zmm7
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm15 = [6,14,6,14,6,14,6,14]
-; AVX512F-ONLY-FAST-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm22, %zmm7
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm29, %zmm15, %zmm7
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm18[0],zmm21[0],zmm18[2],zmm21[2],zmm18[4],zmm21[4],zmm18[6],zmm21[6]
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm12 = [6,14,6,14,6,14,6,14]
+; AVX512F-ONLY-FAST-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm20, %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm30, %zmm12, %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm17[0],zmm21[0],zmm17[2],zmm21[2],zmm17[4],zmm21[4],zmm17[6],zmm21[6]
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm9
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm15, %zmm9
-; AVX512F-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm11 = [6,14,6,14]
-; AVX512F-ONLY-FAST-NEXT:    # ymm11 = mem[0,1,0,1]
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm12, %zmm9
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14]
+; AVX512F-ONLY-FAST-NEXT:    # ymm1 = mem[0,1,0,1]
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm10
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm11, %zmm10
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm1, %zmm10
 ; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7]
 ; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm9, %zmm7, %zmm7
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm26 = [7,15,7,15,7,15,7,15]
 ; AVX512F-ONLY-FAST-NEXT:    # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm22, %zmm7
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm29, %zmm26, %zmm7
-; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm7 {%k1} = zmm18[1],zmm21[1],zmm18[3],zmm21[3],zmm18[5],zmm21[5],zmm18[7],zmm21[7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm20, %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm30, %zmm26, %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm7 {%k1} = zmm17[1],zmm21[1],zmm17[3],zmm21[3],zmm17[5],zmm21[5],zmm17[7],zmm21[7]
 ; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm26, %zmm0
 ; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} ymm28 = [7,15,7,15]
 ; AVX512F-ONLY-FAST-NEXT:    # ymm28 = mem[0,1,2,3,0,1,2,3]
@@ -6037,12 +4715,12 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
 ; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm7, %zmm0
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm17, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm20, %zmm12, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm2 = zmm19[0],zmm25[0],zmm19[2],zmm25[2],zmm19[4],zmm25[4],zmm19[6],zmm25[6]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm18, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm22, %zmm13, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm2 = zmm19[0],zmm15[0],zmm19[2],zmm15[2],zmm19[4],zmm15[4],zmm19[6],zmm15[6]
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k1}
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm6, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm12, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm16, %zmm13, %zmm0
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, %zmm3
 ; AVX512F-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [4,12,4,12]
 ; AVX512F-ONLY-FAST-NEXT:    # ymm4 = mem[0,1,0,1]
@@ -6050,383 +4728,366 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
 ; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm0
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm17, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm20, %zmm14, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm2 = zmm19[1],zmm25[1],zmm19[3],zmm25[3],zmm19[5],zmm25[5],zmm19[7],zmm25[7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm18, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm22, %zmm14, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm2 = zmm19[1],zmm15[1],zmm19[3],zmm15[3],zmm19[5],zmm15[5],zmm19[7],zmm15[7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm19, %zmm9
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k1}
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm6, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm14, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm16, %zmm14, %zmm0
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, %zmm3
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm7
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm13, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm11, %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm11, %zmm3
 ; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
 ; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm0
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm6, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm15, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm16, %zmm12, %zmm0
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm11, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm1, %zmm2
 ; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm19, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm25, %zmm15, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm17[0],zmm20[0],zmm17[2],zmm20[2],zmm17[4],zmm20[4],zmm17[6],zmm20[6]
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm15, %zmm12, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm18[0],zmm22[0],zmm18[2],zmm22[2],zmm18[4],zmm22[4],zmm18[6],zmm22[6]
 ; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm0
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 128(%rcx), %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm26, %zmm6
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm16, %zmm26, %zmm6
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm19, %zmm1
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm25, %zmm26, %zmm1
-; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm17[1],zmm20[1],zmm17[3],zmm20[3],zmm17[5],zmm20[5],zmm17[7],zmm20[7]
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm15, %zmm26, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm18[1],zmm22[1],zmm18[3],zmm22[3],zmm18[5],zmm22[5],zmm18[7],zmm22[7]
 ; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm28, %zmm8
 ; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm8[4,5,6,7]
 ; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm1, %zmm1
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 (%rsp), %zmm11 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm11, %zmm1
 ; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm4, %zmm1
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm11, %zmm1
 ; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm7, %zmm1
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm1
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm11, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm11, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm3, %zmm1
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm28, %zmm13
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm28, %zmm11
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm11, (%rsp) # 64-byte Spill
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 192(%rcx), %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 (%rsp), %zmm13 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm0, %zmm13, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm0, %zmm11, %zmm4
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm0, %zmm13, %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm0, %zmm11, %zmm7
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm0, %zmm13, %zmm11
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm0, %zmm11, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm28, %zmm11
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm28, %zmm13
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm13, (%rsp) # 64-byte Spill
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm23, %zmm12, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm23, %zmm13, %zmm1
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1
 ; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm23, %zmm14, %zmm1
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm23, %zmm15, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm23, %zmm12, %zmm1
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm23, %zmm26, %zmm0
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 128(%r10), %zmm28
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 128(%rax), %zmm4
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm28, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm12, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm13, %zmm0
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm28, %zmm0
 ; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm14, %zmm0
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 128(%r8), %zmm16
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 128(%r9), %zmm13
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm16, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm13, %zmm15, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm9
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 128(%r9), %zmm11
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm16, %zmm24
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm11, %zmm12, %zmm24
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 192(%r10), %zmm23
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 192(%rax), %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 192(%rax), %zmm5
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm23, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm12, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm13, %zmm0
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 192(%rdi), %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 192(%rdi), %zmm1
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 192(%rsi), %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm0, %zmm2, %zmm12
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm23, %zmm3
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm14, %zmm3
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm0, %zmm2, %zmm14
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm0, %zmm1, %zmm13
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm23, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm14, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm0, %zmm1, %zmm14
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 192(%r8), %zmm31
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 192(%r9), %zmm3
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm31, %zmm5
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm15, %zmm5
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, %zmm10
-; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm0, %zmm2, %zmm15
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm26, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm31, %zmm29
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm12, %zmm29
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm0, %zmm1, %zmm12
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm26, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm16, %zmm19
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm11, %zmm26, %zmm19
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm31, %zmm27
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm26, %zmm27
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,0,8,0,8,0,8]
+; AVX512F-ONLY-FAST-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm17, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm21, %zmm0, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [1,9,1,9,1,9,1,9]
+; AVX512F-ONLY-FAST-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm17, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm21, %zmm1, %zmm2
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm16, %zmm27
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm13, %zmm26, %zmm27
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm31, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm26, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm5 = [0,8,0,8,0,8,0,8]
-; AVX512F-ONLY-FAST-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm18, %zmm30
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm21, %zmm5, %zmm30
-; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm6 = [1,9,1,9,1,9,1,9]
-; AVX512F-ONLY-FAST-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm18, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm21, %zmm6, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm26 = [2,10,2,10,2,10,2,10]
 ; AVX512F-ONLY-FAST-NEXT:    # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm18, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm21, %zmm26, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11]
-; AVX512F-ONLY-FAST-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm21, %zmm0, %zmm18
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm22, %zmm7
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm22, %zmm24
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm29, %zmm5, %zmm24
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm29, %zmm6, %zmm22
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, %zmm21
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm29, %zmm26, %zmm21
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm29, %zmm0, %zmm7
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm17, %zmm29
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm20, %zmm5, %zmm29
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm17, %zmm7
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm20, %zmm6, %zmm7
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm17, %zmm7
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm20, %zmm26, %zmm7
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm20, %zmm0, %zmm17
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm17, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm21, %zmm26, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,3,11,3,11,3,11]
+; AVX512F-ONLY-FAST-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm21, %zmm2, %zmm17
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm19, %zmm7
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm25, %zmm5, %zmm7
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm19, %zmm8
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm25, %zmm6, %zmm8
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm19, %zmm20
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm25, %zmm26, %zmm20
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm25, %zmm0, %zmm19
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm28, %zmm19
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm5, %zmm19
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm28, %zmm25
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm6, %zmm25
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm28[0],zmm4[0],zmm28[2],zmm4[2],zmm28[4],zmm4[4],zmm28[6],zmm4[6]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm28[1],zmm4[1],zmm28[3],zmm4[3],zmm28[5],zmm4[5],zmm28[7],zmm4[7]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm28, %zmm9
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm26, %zmm9
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm0, %zmm28
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm16, %zmm9
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm13, %zmm5, %zmm9
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm20, %zmm25
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm30, %zmm0, %zmm25
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm20, %zmm17
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm30, %zmm1, %zmm17
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm20, %zmm21
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm30, %zmm26, %zmm21
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm30, %zmm2, %zmm20
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm18, %zmm20
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm22, %zmm0, %zmm20
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm18, %zmm30
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm22, %zmm1, %zmm30
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm18, %zmm6
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm22, %zmm26, %zmm6
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm22, %zmm2, %zmm18
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, %zmm8
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, %zmm6
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm15, %zmm0, %zmm6
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm15, %zmm1, %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, %zmm22
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm15, %zmm26, %zmm22
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm15, %zmm2, %zmm8
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm28, %zmm14
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm0, %zmm14
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm28, %zmm18
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm1, %zmm18
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm24 {%k1} = zmm28[0],zmm4[0],zmm28[2],zmm4[2],zmm28[4],zmm4[4],zmm28[6],zmm4[6]
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm19 {%k1} = zmm28[1],zmm4[1],zmm28[3],zmm4[3],zmm28[5],zmm4[5],zmm28[7],zmm4[7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm28, %zmm8
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm26, %zmm8
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm2, %zmm28
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm16, %zmm8
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm11, %zmm0, %zmm8
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm16, %zmm15
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm13, %zmm6, %zmm15
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm16, %zmm12
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm13, %zmm26, %zmm12
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm14 = zmm16[0],zmm13[0],zmm16[2],zmm13[2],zmm16[4],zmm13[4],zmm16[6],zmm13[6]
-; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm17 = zmm16[1],zmm13[1],zmm16[3],zmm13[3],zmm16[5],zmm13[5],zmm16[7],zmm13[7]
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm13, %zmm0, %zmm16
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm11, %zmm1, %zmm15
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm16, %zmm9
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm11, %zmm26, %zmm9
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm10 = zmm16[0],zmm11[0],zmm16[2],zmm11[2],zmm16[4],zmm11[4],zmm16[6],zmm11[6]
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm12 = zmm16[1],zmm11[1],zmm16[3],zmm11[3],zmm16[5],zmm11[5],zmm16[7],zmm11[7]
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm11, %zmm2, %zmm16
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm23, %zmm11
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm0, %zmm11
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm23, %zmm13
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm5, %zmm13
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm23, %zmm18
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm6, %zmm18
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm10 {%k1} = zmm23[0],zmm1[0],zmm23[2],zmm1[2],zmm23[4],zmm1[4],zmm23[6],zmm1[6]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm23[1],zmm1[1],zmm23[3],zmm1[3],zmm23[5],zmm1[5],zmm23[7],zmm1[7]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm23, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm26, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm0, %zmm23
-; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm3, %zmm31, %zmm5
-; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm3, %zmm31, %zmm6
-; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm3, %zmm31, %zmm26
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm10 = zmm31[0],zmm3[0],zmm31[2],zmm3[2],zmm31[4],zmm3[4],zmm31[6],zmm3[6]
-; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm11 = zmm31[1],zmm3[1],zmm31[3],zmm3[3],zmm31[5],zmm3[5],zmm31[7],zmm3[7]
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm0, %zmm31
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm30, %zmm24 {%k1}
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa (%rcx), %xmm3
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa (%rdx), %xmm2
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm2[0],xmm3[0]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa (%rsi), %xmm1
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm30 = xmm0[0],xmm1[0]
-; AVX512F-ONLY-FAST-NEXT:    vinserti32x4 $1, %xmm4, %ymm30, %ymm4
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm24, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm1, %zmm13
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm29 {%k1} = zmm23[0],zmm5[0],zmm23[2],zmm5[2],zmm23[4],zmm5[4],zmm23[6],zmm5[6]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm23[1],zmm5[1],zmm23[3],zmm5[3],zmm23[5],zmm5[5],zmm23[7],zmm5[7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm23, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm26, %zmm4
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm22 {%k1}
-; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1]
-; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; AVX512F-ONLY-FAST-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm22, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm29, %zmm7 {%k1}
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa 64(%rcx), %xmm0
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa 64(%rdx), %xmm1
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa 64(%rsi), %xmm4
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%rdi), %xmm29
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm22 = xmm29[0],xmm4[0]
-; AVX512F-ONLY-FAST-NEXT:    vinserti32x4 $1, %xmm2, %ymm22, %ymm2
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm2, %zmm23
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm3, %zmm31, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm3, %zmm31, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm3, %zmm31, %zmm26
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm5 = zmm31[0],zmm3[0],zmm31[2],zmm3[2],zmm31[4],zmm3[4],zmm31[6],zmm3[6]
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm4 = zmm31[1],zmm3[1],zmm31[3],zmm3[3],zmm31[5],zmm3[5],zmm31[7],zmm3[7]
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm2, %zmm31
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm25 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa (%rsi), %xmm2
+; AVX512F-ONLY-FAST-NEXT:    vinserti128 $1, (%rcx), %ymm2, %ymm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa (%rdi), %xmm2
+; AVX512F-ONLY-FAST-NEXT:    vinserti128 $1, (%rdx), %ymm2, %ymm2
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm29 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm29, %zmm25, %zmm25
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm25, %zmm17 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm17, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm20, %zmm6 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 64(%rsi), %xmm2
+; AVX512F-ONLY-FAST-NEXT:    vinserti128 $1, 64(%rcx), %ymm2, %ymm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%rdi), %xmm17
+; AVX512F-ONLY-FAST-NEXT:    vinserti32x4 $1, 64(%rdx), %ymm17, %ymm17
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm20 = ymm17[0],ymm2[0],ymm17[2],ymm2[2]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm20, %zmm6, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm30, %zmm7 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm17[1],ymm2[1],ymm17[3],ymm2[3]
 ; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm7, %zmm2
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm8 {%k1}
-; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm1 = xmm29[1],xmm4[1]
-; AVX512F-ONLY-FAST-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm8, %zmm30
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm19, %zmm9 {%k1}
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa 128(%rcx), %xmm1
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa 128(%rdx), %xmm2
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm2[0],xmm1[0]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm14, %zmm8 {%k1}
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa 128(%rsi), %xmm7
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa 128(%rdi), %xmm8
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm19 = xmm8[0],xmm7[0]
-; AVX512F-ONLY-FAST-NEXT:    vinserti32x4 $1, %xmm4, %ymm19, %ymm4
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm9, %zmm29
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm25, %zmm15 {%k1}
-; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm1 = xmm2[1],xmm1[1]
-; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm8[1],xmm7[1]
-; AVX512F-ONLY-FAST-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm15, %zmm24
+; AVX512F-ONLY-FAST-NEXT:    vinserti128 $1, 128(%rcx), %ymm7, %ymm7
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 128(%rdi), %xmm14
+; AVX512F-ONLY-FAST-NEXT:    vinserti128 $1, 128(%rdx), %ymm14, %ymm14
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm17 = ymm14[0],ymm7[0],ymm14[2],ymm7[2]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm17, %zmm8, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm18, %zmm15 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm7 = ymm14[1],ymm7[1],ymm14[3],ymm7[3]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm7, %zmm15, %zmm30
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm10 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm14 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    # ymm14 = ymm2[0,1,2,3],mem[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm14, %zmm10, %zmm29
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm12 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm14 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    # ymm14 = ymm2[0,1,2,3],mem[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm14, %zmm12, %zmm25
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm14 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    # ymm14 = ymm2[0,1,2,3],mem[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm14, %zmm24, %zmm24
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpblendd $240, (%rsp), %ymm2, %ymm15 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    # ymm15 = ymm2[0,1,2,3],mem[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm15, %zmm19, %zmm15
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm11, %zmm0 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 192(%rsi), %xmm11
+; AVX512F-ONLY-FAST-NEXT:    vinserti128 $1, 192(%rcx), %ymm11, %ymm11
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 192(%rdi), %xmm17
+; AVX512F-ONLY-FAST-NEXT:    vinserti32x4 $1, 192(%rdx), %ymm17, %ymm17
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm18 = ymm17[0],ymm11[0],ymm17[2],ymm11[2]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm18, %zmm0, %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm1 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm11 = ymm17[1],ymm11[1],ymm17[3],ymm11[3]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm11, %zmm1, %zmm1
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm14 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm5 {%k1}
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
-; AVX512F-ONLY-FAST-NEXT:    # ymm2 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm14, %zmm22
+; AVX512F-ONLY-FAST-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    # ymm11 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm11, %zmm5, %zmm5
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm17 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm4 {%k1}
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload
-; AVX512F-ONLY-FAST-NEXT:    # ymm7 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm7, %zmm17, %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    # ymm11 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm11, %zmm4, %zmm4
 ; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload
-; AVX512F-ONLY-FAST-NEXT:    # ymm8 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    # ymm11 = ymm0[0,1,2,3],mem[4,5,6,7]
 ; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vinsertf64x4 $0, %ymm8, %zmm0, %zmm8
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload
-; AVX512F-ONLY-FAST-NEXT:    # ymm9 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm9, %zmm27, %zmm27
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm5 {%k1}
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa 192(%rcx), %xmm13
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa 192(%rdx), %xmm14
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm15 = xmm14[0],xmm13[0]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 192(%rsi), %xmm17
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 192(%rdi), %xmm19
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm25 = xmm19[0],xmm17[0]
-; AVX512F-ONLY-FAST-NEXT:    vinserti32x4 $1, %xmm15, %ymm25, %ymm15
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm15, %zmm5, %zmm25
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm18, %zmm6 {%k1}
-; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm13 = xmm14[1],xmm13[1]
-; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm14 = xmm19[1],xmm17[1]
-; AVX512F-ONLY-FAST-NEXT:    vinserti128 $1, %xmm13, %ymm14, %ymm13
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm13, %zmm6, %zmm6
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm10 {%k1}
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload
-; AVX512F-ONLY-FAST-NEXT:    # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm13, %zmm10, %zmm10
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm11 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vinsertf64x4 $0, %ymm11, %zmm0, %zmm11
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512F-ONLY-FAST-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload
 ; AVX512F-ONLY-FAST-NEXT:    # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm13, %zmm11, %zmm11
-; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload
-; AVX512F-ONLY-FAST-NEXT:    # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vinsertf64x4 $0, %ymm13, %zmm0, %zmm13
-; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vblendps $240, (%rsp), %ymm0, %ymm14 # 32-byte Folded Reload
-; AVX512F-ONLY-FAST-NEXT:    # ymm14 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vinsertf64x4 $0, %ymm14, %zmm0, %zmm14
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm13, %zmm27, %zmm27
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm21 {%k1}
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa (%rcx), %ymm15
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%rdx), %ymm17
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm17[0],ymm15[0],ymm17[2],ymm15[2]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%rsi), %ymm18
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%rdi), %ymm19
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm19[0],ymm18[0],ymm19[2],ymm18[2]
-; AVX512F-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3]
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm21, %zmm21
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%rcx), %ymm17
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%rdx), %ymm18
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm18[0],ymm17[0],ymm18[2],ymm17[2]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%rsi), %ymm19
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%rdi), %ymm20
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm6 = ymm20[0],ymm19[0],ymm20[2],ymm19[2]
+; AVX512F-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm6[2,3],ymm3[2,3]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm21, %zmm3
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm17[1],ymm15[1],ymm17[3],ymm15[3]
-; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm15 = ymm19[1],ymm18[1],ymm19[3],ymm18[3]
-; AVX512F-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm15[2,3],ymm3[2,3]
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm1, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm8 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm6 = ymm18[1],ymm17[1],ymm18[3],ymm17[3]
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm20[1],ymm19[1],ymm20[3],ymm19[3]
+; AVX512F-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm2[2,3],ymm6[2,3]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm8, %zmm2
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm20 {%k1}
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa 64(%rsi), %ymm15
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%rdi), %ymm17
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%rcx), %ymm18
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%rdx), %ymm19
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm9 = ymm19[0],ymm18[0],ymm19[2],ymm18[2]
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm4 = ymm17[0],ymm15[0],ymm17[2],ymm15[2]
-; AVX512F-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm4 = ymm4[2,3],ymm9[2,3]
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm20, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm22 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 64(%rcx), %ymm6
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%rdx), %ymm17
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%rsi), %ymm18
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%rdi), %ymm19
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm14 = ymm17[0],ymm6[0],ymm17[2],ymm6[2]
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm8 = ymm19[0],ymm18[0],ymm19[2],ymm18[2]
+; AVX512F-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm8 = ymm8[2,3],ymm14[2,3]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm8, %zmm22, %zmm8
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm9 = ymm19[1],ymm18[1],ymm19[3],ymm18[3]
-; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm15 = ymm17[1],ymm15[1],ymm17[3],ymm15[3]
-; AVX512F-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm9 = ymm15[2,3],ymm9[2,3]
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm9, %zmm1, %zmm9
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm10 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm6 = ymm17[1],ymm6[1],ymm17[3],ymm6[3]
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm14 = ymm19[1],ymm18[1],ymm19[3],ymm18[3]
+; AVX512F-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm6 = ymm14[2,3],ymm6[2,3]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm6, %zmm10, %zmm6
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm12 {%k1}
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa 128(%rcx), %ymm15
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm9 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 128(%rcx), %ymm14
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 128(%rdx), %ymm17
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm5 = ymm17[0],ymm15[0],ymm17[2],ymm15[2]
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm17[0],ymm14[0],ymm17[2],ymm14[2]
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 128(%rsi), %ymm18
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 128(%rdi), %ymm19
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm19[0],ymm18[0],ymm19[2],ymm18[2]
-; AVX512F-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm5[2,3]
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm12, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm10 = ymm19[0],ymm18[0],ymm19[2],ymm18[2]
+; AVX512F-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm10[2,3],ymm0[2,3]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm9, %zmm0
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm28, %zmm16 {%k1}
-; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm5 = ymm17[1],ymm15[1],ymm17[3],ymm15[3]
-; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm12 = ymm19[1],ymm18[1],ymm19[3],ymm18[3]
-; AVX512F-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm5 = ymm12[2,3],ymm5[2,3]
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm5, %zmm16, %zmm5
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm26 {%k1}
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa 192(%rcx), %ymm12
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa 192(%rdx), %ymm15
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm15[0],ymm12[0],ymm15[2],ymm12[2]
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm9 = ymm17[1],ymm14[1],ymm17[3],ymm14[3]
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm10 = ymm19[1],ymm18[1],ymm19[3],ymm18[3]
+; AVX512F-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm9 = ymm10[2,3],ymm9[2,3]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm9, %zmm16, %zmm9
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm10, %zmm26 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 192(%rcx), %ymm10
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 192(%rdx), %ymm14
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm13 = ymm14[0],ymm10[0],ymm14[2],ymm10[2]
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 192(%rsi), %ymm16
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 192(%rdi), %ymm17
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm2 = ymm17[0],ymm16[0],ymm17[2],ymm16[2]
-; AVX512F-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3]
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm26, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm12 = ymm17[0],ymm16[0],ymm17[2],ymm16[2]
+; AVX512F-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm12 = ymm12[2,3],ymm13[2,3]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm12, %zmm26, %zmm12
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm23, %zmm31 {%k1}
-; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm15[1],ymm12[1],ymm15[3],ymm12[3]
-; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm12 = ymm17[1],ymm16[1],ymm17[3],ymm16[3]
-; AVX512F-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm12[2,3],ymm2[2,3]
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm31, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm10 = ymm14[1],ymm10[1],ymm14[3],ymm10[3]
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm13 = ymm17[1],ymm16[1],ymm17[3],ymm16[3]
+; AVX512F-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm10 = ymm13[2,3],ymm10[2,3]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm10, %zmm31, %zmm10
 ; AVX512F-ONLY-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, 1728(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, 1664(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, 1216(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, 1152(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, 704(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, 640(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, 192(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm21, 128(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm14, 1984(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm13, 1920(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm11, 1856(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm10, 1792(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm6, 1600(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm25, 1536(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm27, 1472(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm8, 1408(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, 1344(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm22, 1280(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm24, 1088(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm29, 1024(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm10, 1728(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm12, 1664(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, 1216(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, 1152(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm6, 704(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, 640(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, 192(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, 128(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm27, 1984(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm11, 1920(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, 1856(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, 1792(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, 1600(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, 1536(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm15, 1472(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm24, 1408(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm25, 1344(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm29, 1280(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm30, 1088(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 1024(%rax)
 ; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 960(%rax)
 ; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
@@ -6435,7 +5096,8 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 832(%rax)
 ; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 768(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm30, 576(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 576(%rax)
 ; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 512(%rax)
 ; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
@@ -6450,13 +5112,13 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 64(%rax)
 ; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, (%rax)
-; AVX512F-ONLY-FAST-NEXT:    addq $2632, %rsp # imm = 0xA48
+; AVX512F-ONLY-FAST-NEXT:    addq $2568, %rsp # imm = 0xA08
 ; AVX512F-ONLY-FAST-NEXT:    vzeroupper
 ; AVX512F-ONLY-FAST-NEXT:    retq
 ;
 ; AVX512DQ-SLOW-LABEL: store_i64_stride8_vf32:
 ; AVX512DQ-SLOW:       # %bb.0:
-; AVX512DQ-SLOW-NEXT:    subq $2632, %rsp # imm = 0xA48
+; AVX512DQ-SLOW-NEXT:    subq $2568, %rsp # imm = 0xA08
 ; AVX512DQ-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512DQ-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %r10
 ; AVX512DQ-SLOW-NEXT:    vmovaps 128(%rdi), %zmm0
@@ -6464,75 +5126,75 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 64(%rdi), %zmm6
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm0
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 128(%rsi), %zmm23
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 64(%rsi), %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 64(%rsi), %zmm16
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%rsi), %zmm3
-; AVX512DQ-SLOW-NEXT:    vmovaps 192(%rdx), %zmm2
-; AVX512DQ-SLOW-NEXT:    vmovups %zmm2, (%rsp) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovaps 128(%rdx), %zmm2
-; AVX512DQ-SLOW-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovaps 192(%rdx), %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovaps 128(%rdx), %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovups %zmm1, (%rsp) # 64-byte Spill
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 64(%rdx), %zmm8
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%rdx), %zmm2
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 64(%rcx), %zmm5
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%rcx), %zmm4
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%r8), %zmm22
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%r8), %zmm20
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 64(%r8), %zmm19
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%r9), %zmm29
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 64(%r9), %zmm25
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%r10), %zmm18
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 64(%r10), %zmm17
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%r9), %zmm30
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 64(%r9), %zmm15
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%r10), %zmm17
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 64(%r10), %zmm18
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%rax), %zmm21
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 64(%rax), %zmm20
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 64(%rax), %zmm22
 ; AVX512DQ-SLOW-NEXT:    movb $-64, %r11b
 ; AVX512DQ-SLOW-NEXT:    kmovw %r11d, %k1
-; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm12 = [4,12,4,12,4,12,4,12]
-; AVX512DQ-SLOW-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm7
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm21, %zmm12, %zmm7
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm9 = zmm22[0],zmm29[0],zmm22[2],zmm29[2],zmm22[4],zmm29[4],zmm22[6],zmm29[6]
+; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm13 = [4,12,4,12,4,12,4,12]
+; AVX512DQ-SLOW-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm7
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm21, %zmm13, %zmm7
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm9 = zmm20[0],zmm30[0],zmm20[2],zmm30[2],zmm20[4],zmm30[4],zmm20[6],zmm30[6]
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm9 {%k1}
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm7
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm3, %zmm12, %zmm7
-; AVX512DQ-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm11 = [4,12,4,12]
-; AVX512DQ-SLOW-NEXT:    # ymm11 = mem[0,1,0,1]
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm3, %zmm13, %zmm7
+; AVX512DQ-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [4,12,4,12]
+; AVX512DQ-SLOW-NEXT:    # ymm1 = mem[0,1,0,1]
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm10
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm4, %zmm11, %zmm10
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm4, %zmm1, %zmm10
 ; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7]
 ; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm7, %zmm9, %zmm7
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm14 = [5,13,5,13,5,13,5,13]
 ; AVX512DQ-SLOW-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm7
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm7
 ; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm21, %zmm14, %zmm7
-; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm9 = zmm22[1],zmm29[1],zmm22[3],zmm29[3],zmm22[5],zmm29[5],zmm22[7],zmm29[7]
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm9 = zmm20[1],zmm30[1],zmm20[3],zmm30[3],zmm20[5],zmm30[5],zmm20[7],zmm30[7]
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm9 {%k1}
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm7
 ; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm3, %zmm14, %zmm7
-; AVX512DQ-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm13 = [5,13,5,13]
-; AVX512DQ-SLOW-NEXT:    # ymm13 = mem[0,1,0,1]
+; AVX512DQ-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm11 = [5,13,5,13]
+; AVX512DQ-SLOW-NEXT:    # ymm11 = mem[0,1,0,1]
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm10
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm4, %zmm13, %zmm10
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm4, %zmm11, %zmm10
 ; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7]
 ; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm7, %zmm9, %zmm7
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm15 = [6,14,6,14,6,14,6,14]
-; AVX512DQ-SLOW-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm22, %zmm7
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm29, %zmm15, %zmm7
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm18[0],zmm21[0],zmm18[2],zmm21[2],zmm18[4],zmm21[4],zmm18[6],zmm21[6]
+; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm12 = [6,14,6,14,6,14,6,14]
+; AVX512DQ-SLOW-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm7
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm30, %zmm12, %zmm7
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm17[0],zmm21[0],zmm17[2],zmm21[2],zmm17[4],zmm21[4],zmm17[6],zmm21[6]
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm9
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm3, %zmm15, %zmm9
-; AVX512DQ-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm11 = [6,14,6,14]
-; AVX512DQ-SLOW-NEXT:    # ymm11 = mem[0,1,0,1]
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm3, %zmm12, %zmm9
+; AVX512DQ-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14]
+; AVX512DQ-SLOW-NEXT:    # ymm1 = mem[0,1,0,1]
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm10
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm4, %zmm11, %zmm10
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm4, %zmm1, %zmm10
 ; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7]
 ; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm9, %zmm7, %zmm7
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm26 = [7,15,7,15,7,15,7,15]
 ; AVX512DQ-SLOW-NEXT:    # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm22, %zmm7
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm29, %zmm26, %zmm7
-; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm7 {%k1} = zmm18[1],zmm21[1],zmm18[3],zmm21[3],zmm18[5],zmm21[5],zmm18[7],zmm21[7]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm7
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm30, %zmm26, %zmm7
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm7 {%k1} = zmm17[1],zmm21[1],zmm17[3],zmm21[3],zmm17[5],zmm21[5],zmm17[7],zmm21[7]
 ; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm3, %zmm26, %zmm0
 ; AVX512DQ-SLOW-NEXT:    vbroadcasti64x2 {{.*#+}} ymm28 = [7,15,7,15]
 ; AVX512DQ-SLOW-NEXT:    # ymm28 = mem[0,1,0,1]
@@ -6540,12 +5202,12 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
 ; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm7, %zmm0
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm0
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm20, %zmm12, %zmm0
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm2 = zmm19[0],zmm25[0],zmm19[2],zmm25[2],zmm19[4],zmm25[4],zmm19[6],zmm25[6]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm0
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm22, %zmm13, %zmm0
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm2 = zmm19[0],zmm15[0],zmm19[2],zmm15[2],zmm19[4],zmm15[4],zmm19[6],zmm15[6]
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k1}
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm0
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm1, %zmm12, %zmm0
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm16, %zmm13, %zmm0
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm3
 ; AVX512DQ-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [4,12,4,12]
 ; AVX512DQ-SLOW-NEXT:    # ymm4 = mem[0,1,0,1]
@@ -6553,383 +5215,366 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
 ; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm0
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm0
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm20, %zmm14, %zmm0
-; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm2 = zmm19[1],zmm25[1],zmm19[3],zmm25[3],zmm19[5],zmm25[5],zmm19[7],zmm25[7]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm0
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm22, %zmm14, %zmm0
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm2 = zmm19[1],zmm15[1],zmm19[3],zmm15[3],zmm19[5],zmm15[5],zmm19[7],zmm15[7]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm19, %zmm9
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k1}
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm0
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm1, %zmm14, %zmm0
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm16, %zmm14, %zmm0
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm3
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm7
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm5, %zmm13, %zmm3
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm7
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm5, %zmm11, %zmm3
 ; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
 ; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm0
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm0
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm1, %zmm15, %zmm0
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm16, %zmm12, %zmm0
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm2
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm5, %zmm11, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm3
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm5, %zmm1, %zmm2
 ; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm19, %zmm2
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm25, %zmm15, %zmm2
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm17[0],zmm20[0],zmm17[2],zmm20[2],zmm17[4],zmm20[4],zmm17[6],zmm20[6]
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm15, %zmm12, %zmm2
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm18[0],zmm22[0],zmm18[2],zmm22[2],zmm18[4],zmm22[4],zmm18[6],zmm22[6]
 ; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm0
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 128(%rcx), %zmm0
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm1, %zmm26, %zmm6
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm16, %zmm26, %zmm6
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm19, %zmm1
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm25, %zmm26, %zmm1
-; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm17[1],zmm20[1],zmm17[3],zmm20[3],zmm17[5],zmm20[5],zmm17[7],zmm20[7]
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm15, %zmm26, %zmm1
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm18[1],zmm22[1],zmm18[3],zmm22[3],zmm18[5],zmm22[5],zmm18[7],zmm22[7]
 ; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm5, %zmm28, %zmm8
 ; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm8[4,5,6,7]
 ; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm1, %zmm1
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 (%rsp), %zmm11 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm1
 ; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm0, %zmm4, %zmm1
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm1
 ; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm0, %zmm7, %zmm1
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm1
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm0, %zmm11, %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm1
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm0, %zmm3, %zmm1
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm0, %zmm28, %zmm13
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm0, %zmm28, %zmm11
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm11, (%rsp) # 64-byte Spill
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 192(%rcx), %zmm0
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 (%rsp), %zmm13 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm0, %zmm13, %zmm4
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm0, %zmm11, %zmm4
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm0, %zmm13, %zmm7
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm0, %zmm11, %zmm7
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm0, %zmm13, %zmm11
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm0, %zmm11, %zmm3
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm0, %zmm28, %zmm11
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm0, %zmm28, %zmm13
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm13, (%rsp) # 64-byte Spill
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm23, %zmm12, %zmm1
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm23, %zmm13, %zmm1
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1
 ; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm23, %zmm14, %zmm1
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm23, %zmm15, %zmm1
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm23, %zmm12, %zmm1
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm23, %zmm26, %zmm0
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 128(%r10), %zmm28
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 128(%rax), %zmm4
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm0
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm4, %zmm12, %zmm0
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm4, %zmm13, %zmm0
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm0
 ; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm4, %zmm14, %zmm0
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 128(%r8), %zmm16
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 128(%r9), %zmm13
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm0
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm13, %zmm15, %zmm0
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm9
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 128(%r9), %zmm11
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm24
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm11, %zmm12, %zmm24
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 192(%r10), %zmm23
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 192(%rax), %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 192(%rax), %zmm5
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm0
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm1, %zmm12, %zmm0
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm5, %zmm13, %zmm0
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 192(%rdi), %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 192(%rdi), %zmm1
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 192(%rsi), %zmm0
-; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm0, %zmm2, %zmm12
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm3
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm1, %zmm14, %zmm3
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm0, %zmm2, %zmm14
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm0, %zmm1, %zmm13
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm2
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm5, %zmm14, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm0, %zmm1, %zmm14
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 192(%r8), %zmm31
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 192(%r9), %zmm3
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm5
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm3, %zmm15, %zmm5
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm10
-; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm0, %zmm2, %zmm15
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm0, %zmm26, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm29
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm3, %zmm12, %zmm29
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm0, %zmm1, %zmm12
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm0, %zmm26, %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm19
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm11, %zmm26, %zmm19
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm27
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm3, %zmm26, %zmm27
+; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,0,8,0,8,0,8]
+; AVX512DQ-SLOW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm1
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm21, %zmm0, %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [1,9,1,9,1,9,1,9]
+; AVX512DQ-SLOW-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm2
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm21, %zmm1, %zmm2
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm27
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm13, %zmm26, %zmm27
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm0
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm3, %zmm26, %zmm0
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm2
-; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm5 = [0,8,0,8,0,8,0,8]
-; AVX512DQ-SLOW-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm30
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm21, %zmm5, %zmm30
-; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm6 = [1,9,1,9,1,9,1,9]
-; AVX512DQ-SLOW-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm0
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm21, %zmm6, %zmm0
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm26 = [2,10,2,10,2,10,2,10]
 ; AVX512DQ-SLOW-NEXT:    # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm0
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm21, %zmm26, %zmm0
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11]
-; AVX512DQ-SLOW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm21, %zmm0, %zmm18
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm22, %zmm7
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm22, %zmm24
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm29, %zmm5, %zmm24
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm29, %zmm6, %zmm22
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm21
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm29, %zmm26, %zmm21
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm29, %zmm0, %zmm7
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm29
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm20, %zmm5, %zmm29
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm7
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm20, %zmm6, %zmm7
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm7
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm20, %zmm26, %zmm7
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm20, %zmm0, %zmm17
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm2
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm21, %zmm26, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,3,11,3,11,3,11]
+; AVX512DQ-SLOW-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm21, %zmm2, %zmm17
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm19, %zmm7
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm25, %zmm5, %zmm7
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm19, %zmm8
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm25, %zmm6, %zmm8
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm19, %zmm20
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm25, %zmm26, %zmm20
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm25, %zmm0, %zmm19
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm19
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm4, %zmm5, %zmm19
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm25
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm4, %zmm6, %zmm25
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm28[0],zmm4[0],zmm28[2],zmm4[2],zmm28[4],zmm4[4],zmm28[6],zmm4[6]
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm28[1],zmm4[1],zmm28[3],zmm4[3],zmm28[5],zmm4[5],zmm28[7],zmm4[7]
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm9
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm4, %zmm26, %zmm9
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm4, %zmm0, %zmm28
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm9
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm13, %zmm5, %zmm9
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm25
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm30, %zmm0, %zmm25
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm17
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm30, %zmm1, %zmm17
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm21
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm30, %zmm26, %zmm21
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm30, %zmm2, %zmm20
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm20
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm22, %zmm0, %zmm20
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm30
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm22, %zmm1, %zmm30
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm6
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm22, %zmm26, %zmm6
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm22, %zmm2, %zmm18
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm8
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm6
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm15, %zmm0, %zmm6
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm7
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm15, %zmm1, %zmm7
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm22
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm15, %zmm26, %zmm22
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm15, %zmm2, %zmm8
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm14
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm4, %zmm0, %zmm14
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm18
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm4, %zmm1, %zmm18
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm24 {%k1} = zmm28[0],zmm4[0],zmm28[2],zmm4[2],zmm28[4],zmm4[4],zmm28[6],zmm4[6]
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm19 {%k1} = zmm28[1],zmm4[1],zmm28[3],zmm4[3],zmm28[5],zmm4[5],zmm28[7],zmm4[7]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm8
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm4, %zmm26, %zmm8
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm4, %zmm2, %zmm28
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm8
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm11, %zmm0, %zmm8
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm15
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm13, %zmm6, %zmm15
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm12
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm13, %zmm26, %zmm12
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm14 = zmm16[0],zmm13[0],zmm16[2],zmm13[2],zmm16[4],zmm13[4],zmm16[6],zmm13[6]
-; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm17 = zmm16[1],zmm13[1],zmm16[3],zmm13[3],zmm16[5],zmm13[5],zmm16[7],zmm13[7]
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm13, %zmm0, %zmm16
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm11, %zmm1, %zmm15
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm9
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm11, %zmm26, %zmm9
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm10 = zmm16[0],zmm11[0],zmm16[2],zmm11[2],zmm16[4],zmm11[4],zmm16[6],zmm11[6]
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm12 = zmm16[1],zmm11[1],zmm16[3],zmm11[3],zmm16[5],zmm11[5],zmm16[7],zmm11[7]
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm11, %zmm2, %zmm16
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm11
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm5, %zmm0, %zmm11
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm13
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm1, %zmm5, %zmm13
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm18
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm1, %zmm6, %zmm18
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm10 {%k1} = zmm23[0],zmm1[0],zmm23[2],zmm1[2],zmm23[4],zmm1[4],zmm23[6],zmm1[6]
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm23[1],zmm1[1],zmm23[3],zmm1[3],zmm23[5],zmm1[5],zmm23[7],zmm1[7]
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm2
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm1, %zmm26, %zmm2
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm1, %zmm0, %zmm23
-; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm3, %zmm31, %zmm5
-; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm3, %zmm31, %zmm6
-; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm3, %zmm31, %zmm26
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm10 = zmm31[0],zmm3[0],zmm31[2],zmm3[2],zmm31[4],zmm3[4],zmm31[6],zmm3[6]
-; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm11 = zmm31[1],zmm3[1],zmm31[3],zmm3[3],zmm31[5],zmm3[5],zmm31[7],zmm3[7]
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm3, %zmm0, %zmm31
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm30, %zmm24 {%k1}
-; AVX512DQ-SLOW-NEXT:    vmovdqa (%rcx), %xmm3
-; AVX512DQ-SLOW-NEXT:    vmovdqa (%rdx), %xmm2
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm2[0],xmm3[0]
-; AVX512DQ-SLOW-NEXT:    vmovdqa (%rsi), %xmm1
-; AVX512DQ-SLOW-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm30 = xmm0[0],xmm1[0]
-; AVX512DQ-SLOW-NEXT:    vinserti32x4 $1, %xmm4, %ymm30, %ymm4
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm24, %zmm4
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm5, %zmm1, %zmm13
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm29 {%k1} = zmm23[0],zmm5[0],zmm23[2],zmm5[2],zmm23[4],zmm5[4],zmm23[6],zmm5[6]
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm23[1],zmm5[1],zmm23[3],zmm5[3],zmm23[5],zmm5[5],zmm23[7],zmm5[7]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm4
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm5, %zmm26, %zmm4
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm22 {%k1}
-; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1]
-; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; AVX512DQ-SLOW-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm22, %zmm0
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm29, %zmm7 {%k1}
-; AVX512DQ-SLOW-NEXT:    vmovdqa 64(%rcx), %xmm0
-; AVX512DQ-SLOW-NEXT:    vmovdqa 64(%rdx), %xmm1
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX512DQ-SLOW-NEXT:    vmovdqa 64(%rsi), %xmm4
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 64(%rdi), %xmm29
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm22 = xmm29[0],xmm4[0]
-; AVX512DQ-SLOW-NEXT:    vinserti32x4 $1, %xmm2, %ymm22, %ymm2
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm5, %zmm2, %zmm23
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm3, %zmm31, %zmm0
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm3, %zmm31, %zmm1
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm3, %zmm31, %zmm26
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm5 = zmm31[0],zmm3[0],zmm31[2],zmm3[2],zmm31[4],zmm3[4],zmm31[6],zmm3[6]
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm4 = zmm31[1],zmm3[1],zmm31[3],zmm3[3],zmm31[5],zmm3[5],zmm31[7],zmm3[7]
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm3, %zmm2, %zmm31
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm25 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqa (%rsi), %xmm2
+; AVX512DQ-SLOW-NEXT:    vinserti128 $1, (%rcx), %ymm2, %ymm3
+; AVX512DQ-SLOW-NEXT:    vmovdqa (%rdi), %xmm2
+; AVX512DQ-SLOW-NEXT:    vinserti128 $1, (%rdx), %ymm2, %ymm2
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm29 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm29, %zmm25, %zmm25
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm25, %zmm17 {%k1}
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm17, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm6 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqa 64(%rsi), %xmm2
+; AVX512DQ-SLOW-NEXT:    vinserti128 $1, 64(%rcx), %ymm2, %ymm2
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 64(%rdi), %xmm17
+; AVX512DQ-SLOW-NEXT:    vinserti32x4 $1, 64(%rdx), %ymm17, %ymm17
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm20 = ymm17[0],ymm2[0],ymm17[2],ymm2[2]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm20, %zmm6, %zmm3
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm30, %zmm7 {%k1}
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm17[1],ymm2[1],ymm17[3],ymm2[3]
 ; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm7, %zmm2
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm8 {%k1}
-; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm1 = xmm29[1],xmm4[1]
-; AVX512DQ-SLOW-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm8, %zmm30
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm19, %zmm9 {%k1}
-; AVX512DQ-SLOW-NEXT:    vmovdqa 128(%rcx), %xmm1
-; AVX512DQ-SLOW-NEXT:    vmovdqa 128(%rdx), %xmm2
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm2[0],xmm1[0]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm14, %zmm8 {%k1}
 ; AVX512DQ-SLOW-NEXT:    vmovdqa 128(%rsi), %xmm7
-; AVX512DQ-SLOW-NEXT:    vmovdqa 128(%rdi), %xmm8
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm19 = xmm8[0],xmm7[0]
-; AVX512DQ-SLOW-NEXT:    vinserti32x4 $1, %xmm4, %ymm19, %ymm4
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm9, %zmm29
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm25, %zmm15 {%k1}
-; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm1 = xmm2[1],xmm1[1]
-; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm8[1],xmm7[1]
-; AVX512DQ-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm15, %zmm24
+; AVX512DQ-SLOW-NEXT:    vinserti128 $1, 128(%rcx), %ymm7, %ymm7
+; AVX512DQ-SLOW-NEXT:    vmovdqa 128(%rdi), %xmm14
+; AVX512DQ-SLOW-NEXT:    vinserti128 $1, 128(%rdx), %ymm14, %ymm14
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm17 = ymm14[0],ymm7[0],ymm14[2],ymm7[2]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm17, %zmm8, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm15 {%k1}
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm7 = ymm14[1],ymm7[1],ymm14[3],ymm7[3]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm7, %zmm15, %zmm30
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm10 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm14 # 32-byte Folded Reload
+; AVX512DQ-SLOW-NEXT:    # ymm14 = ymm2[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm14, %zmm10, %zmm29
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm12 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm14 # 32-byte Folded Reload
+; AVX512DQ-SLOW-NEXT:    # ymm14 = ymm2[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm14, %zmm12, %zmm25
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm14 # 32-byte Folded Reload
+; AVX512DQ-SLOW-NEXT:    # ymm14 = ymm2[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm14, %zmm24, %zmm24
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpblendd $240, (%rsp), %ymm2, %ymm15 # 32-byte Folded Reload
+; AVX512DQ-SLOW-NEXT:    # ymm15 = ymm2[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm15, %zmm19, %zmm15
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm0 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqa 192(%rsi), %xmm11
+; AVX512DQ-SLOW-NEXT:    vinserti128 $1, 192(%rcx), %ymm11, %ymm11
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 192(%rdi), %xmm17
+; AVX512DQ-SLOW-NEXT:    vinserti32x4 $1, 192(%rdx), %ymm17, %ymm17
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm18 = ymm17[0],ymm11[0],ymm17[2],ymm11[2]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm18, %zmm0, %zmm7
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm1 {%k1}
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm11 = ymm17[1],ymm11[1],ymm17[3],ymm11[3]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm11, %zmm1, %zmm1
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm14 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm5 {%k1}
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
-; AVX512DQ-SLOW-NEXT:    # ymm2 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm14, %zmm22
+; AVX512DQ-SLOW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload
+; AVX512DQ-SLOW-NEXT:    # ymm11 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm11, %zmm5, %zmm5
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm17 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm4 {%k1}
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload
-; AVX512DQ-SLOW-NEXT:    # ymm7 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm7, %zmm17, %zmm7
+; AVX512DQ-SLOW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload
+; AVX512DQ-SLOW-NEXT:    # ymm11 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm11, %zmm4, %zmm4
 ; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload
-; AVX512DQ-SLOW-NEXT:    # ymm8 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload
+; AVX512DQ-SLOW-NEXT:    # ymm11 = ymm0[0,1,2,3],mem[4,5,6,7]
 ; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vinsertf64x4 $0, %ymm8, %zmm0, %zmm8
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload
-; AVX512DQ-SLOW-NEXT:    # ymm9 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm9, %zmm27, %zmm27
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm5 {%k1}
-; AVX512DQ-SLOW-NEXT:    vmovdqa 192(%rcx), %xmm13
-; AVX512DQ-SLOW-NEXT:    vmovdqa 192(%rdx), %xmm14
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm15 = xmm14[0],xmm13[0]
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 192(%rsi), %xmm17
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 192(%rdi), %xmm19
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm25 = xmm19[0],xmm17[0]
-; AVX512DQ-SLOW-NEXT:    vinserti32x4 $1, %xmm15, %ymm25, %ymm15
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm15, %zmm5, %zmm25
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm6 {%k1}
-; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm13 = xmm14[1],xmm13[1]
-; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm14 = xmm19[1],xmm17[1]
-; AVX512DQ-SLOW-NEXT:    vinserti128 $1, %xmm13, %ymm14, %ymm13
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm13, %zmm6, %zmm6
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm10 {%k1}
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload
-; AVX512DQ-SLOW-NEXT:    # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm13, %zmm10, %zmm10
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm11 {%k1}
+; AVX512DQ-SLOW-NEXT:    vinsertf64x4 $0, %ymm11, %zmm0, %zmm11
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQ-SLOW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload
 ; AVX512DQ-SLOW-NEXT:    # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm13, %zmm11, %zmm11
-; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload
-; AVX512DQ-SLOW-NEXT:    # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vinsertf64x4 $0, %ymm13, %zmm0, %zmm13
-; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vblendps $240, (%rsp), %ymm0, %ymm14 # 32-byte Folded Reload
-; AVX512DQ-SLOW-NEXT:    # ymm14 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vinsertf64x4 $0, %ymm14, %zmm0, %zmm14
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm13, %zmm27, %zmm27
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm21 {%k1}
-; AVX512DQ-SLOW-NEXT:    vmovdqa (%rcx), %ymm15
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%rdx), %ymm17
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm17[0],ymm15[0],ymm17[2],ymm15[2]
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%rsi), %ymm18
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%rdi), %ymm19
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm19[0],ymm18[0],ymm19[2],ymm18[2]
-; AVX512DQ-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3]
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm21, %zmm21
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%rcx), %ymm17
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%rdx), %ymm18
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm18[0],ymm17[0],ymm18[2],ymm17[2]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%rsi), %ymm19
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%rdi), %ymm20
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm6 = ymm20[0],ymm19[0],ymm20[2],ymm19[2]
+; AVX512DQ-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm6[2,3],ymm3[2,3]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm21, %zmm3
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm17[1],ymm15[1],ymm17[3],ymm15[3]
-; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm15 = ymm19[1],ymm18[1],ymm19[3],ymm18[3]
-; AVX512DQ-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm15[2,3],ymm3[2,3]
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm1, %zmm3
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm8 {%k1}
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm6 = ymm18[1],ymm17[1],ymm18[3],ymm17[3]
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm20[1],ymm19[1],ymm20[3],ymm19[3]
+; AVX512DQ-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm2[2,3],ymm6[2,3]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm8, %zmm2
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm20 {%k1}
-; AVX512DQ-SLOW-NEXT:    vmovdqa 64(%rsi), %ymm15
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 64(%rdi), %ymm17
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 64(%rcx), %ymm18
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 64(%rdx), %ymm19
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm9 = ymm19[0],ymm18[0],ymm19[2],ymm18[2]
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm4 = ymm17[0],ymm15[0],ymm17[2],ymm15[2]
-; AVX512DQ-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm4 = ymm4[2,3],ymm9[2,3]
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm20, %zmm4
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm22 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqa 64(%rcx), %ymm6
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 64(%rdx), %ymm17
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 64(%rsi), %ymm18
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 64(%rdi), %ymm19
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm14 = ymm17[0],ymm6[0],ymm17[2],ymm6[2]
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm8 = ymm19[0],ymm18[0],ymm19[2],ymm18[2]
+; AVX512DQ-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm8 = ymm8[2,3],ymm14[2,3]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm8, %zmm22, %zmm8
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm9 = ymm19[1],ymm18[1],ymm19[3],ymm18[3]
-; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm15 = ymm17[1],ymm15[1],ymm17[3],ymm15[3]
-; AVX512DQ-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm9 = ymm15[2,3],ymm9[2,3]
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm9, %zmm1, %zmm9
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm10 {%k1}
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm6 = ymm17[1],ymm6[1],ymm17[3],ymm6[3]
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm14 = ymm19[1],ymm18[1],ymm19[3],ymm18[3]
+; AVX512DQ-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm6 = ymm14[2,3],ymm6[2,3]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm6, %zmm10, %zmm6
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm12 {%k1}
-; AVX512DQ-SLOW-NEXT:    vmovdqa 128(%rcx), %ymm15
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm9 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqa 128(%rcx), %ymm14
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 128(%rdx), %ymm17
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm5 = ymm17[0],ymm15[0],ymm17[2],ymm15[2]
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm17[0],ymm14[0],ymm17[2],ymm14[2]
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 128(%rsi), %ymm18
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 128(%rdi), %ymm19
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm19[0],ymm18[0],ymm19[2],ymm18[2]
-; AVX512DQ-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm5[2,3]
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm12, %zmm1
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm10 = ymm19[0],ymm18[0],ymm19[2],ymm18[2]
+; AVX512DQ-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm10[2,3],ymm0[2,3]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm9, %zmm0
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm16 {%k1}
-; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm5 = ymm17[1],ymm15[1],ymm17[3],ymm15[3]
-; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm12 = ymm19[1],ymm18[1],ymm19[3],ymm18[3]
-; AVX512DQ-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm5 = ymm12[2,3],ymm5[2,3]
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm5, %zmm16, %zmm5
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm26 {%k1}
-; AVX512DQ-SLOW-NEXT:    vmovdqa 192(%rcx), %ymm12
-; AVX512DQ-SLOW-NEXT:    vmovdqa 192(%rdx), %ymm15
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm15[0],ymm12[0],ymm15[2],ymm12[2]
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm9 = ymm17[1],ymm14[1],ymm17[3],ymm14[3]
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm10 = ymm19[1],ymm18[1],ymm19[3],ymm18[3]
+; AVX512DQ-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm9 = ymm10[2,3],ymm9[2,3]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm9, %zmm16, %zmm9
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm26 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqa 192(%rcx), %ymm10
+; AVX512DQ-SLOW-NEXT:    vmovdqa 192(%rdx), %ymm14
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm13 = ymm14[0],ymm10[0],ymm14[2],ymm10[2]
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 192(%rsi), %ymm16
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 192(%rdi), %ymm17
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm2 = ymm17[0],ymm16[0],ymm17[2],ymm16[2]
-; AVX512DQ-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3]
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm26, %zmm0
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm12 = ymm17[0],ymm16[0],ymm17[2],ymm16[2]
+; AVX512DQ-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm12 = ymm12[2,3],ymm13[2,3]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm12, %zmm26, %zmm12
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm31 {%k1}
-; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm15[1],ymm12[1],ymm15[3],ymm12[3]
-; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm12 = ymm17[1],ymm16[1],ymm17[3],ymm16[3]
-; AVX512DQ-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm12[2,3],ymm2[2,3]
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm31, %zmm2
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm10 = ymm14[1],ymm10[1],ymm14[3],ymm10[3]
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm13 = ymm17[1],ymm16[1],ymm17[3],ymm16[3]
+; AVX512DQ-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm10 = ymm13[2,3],ymm10[2,3]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm10, %zmm31, %zmm10
 ; AVX512DQ-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm2, 1728(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, 1664(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm5, 1216(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, 1152(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm9, 704(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm4, 640(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, 192(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm21, 128(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovaps %zmm14, 1984(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovaps %zmm13, 1920(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm11, 1856(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm10, 1792(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm6, 1600(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm25, 1536(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm27, 1472(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovaps %zmm8, 1408(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm7, 1344(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm22, 1280(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm24, 1088(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm29, 1024(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm10, 1728(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm12, 1664(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm9, 1216(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, 1152(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm6, 704(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm8, 640(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm2, 192(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, 128(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm27, 1984(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm11, 1920(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm4, 1856(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm5, 1792(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, 1600(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm7, 1536(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm15, 1472(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm24, 1408(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm25, 1344(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm29, 1280(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm30, 1088(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 1024(%rax)
 ; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 960(%rax)
 ; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
@@ -6938,7 +5583,8 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 832(%rax)
 ; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 768(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm30, 576(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 576(%rax)
 ; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 512(%rax)
 ; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
@@ -6953,13 +5599,13 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 64(%rax)
 ; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, (%rax)
-; AVX512DQ-SLOW-NEXT:    addq $2632, %rsp # imm = 0xA48
+; AVX512DQ-SLOW-NEXT:    addq $2568, %rsp # imm = 0xA08
 ; AVX512DQ-SLOW-NEXT:    vzeroupper
 ; AVX512DQ-SLOW-NEXT:    retq
 ;
 ; AVX512DQ-FAST-LABEL: store_i64_stride8_vf32:
 ; AVX512DQ-FAST:       # %bb.0:
-; AVX512DQ-FAST-NEXT:    subq $2632, %rsp # imm = 0xA48
+; AVX512DQ-FAST-NEXT:    subq $2568, %rsp # imm = 0xA08
 ; AVX512DQ-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512DQ-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %r10
 ; AVX512DQ-FAST-NEXT:    vmovaps 128(%rdi), %zmm0
@@ -6967,75 +5613,75 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%rdi), %zmm6
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 (%rdi), %zmm0
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 128(%rsi), %zmm23
-; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%rsi), %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%rsi), %zmm16
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 (%rsi), %zmm3
-; AVX512DQ-FAST-NEXT:    vmovaps 192(%rdx), %zmm2
-; AVX512DQ-FAST-NEXT:    vmovups %zmm2, (%rsp) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovaps 128(%rdx), %zmm2
-; AVX512DQ-FAST-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovaps 192(%rdx), %zmm1
+; AVX512DQ-FAST-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovaps 128(%rdx), %zmm1
+; AVX512DQ-FAST-NEXT:    vmovups %zmm1, (%rsp) # 64-byte Spill
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%rdx), %zmm8
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 (%rdx), %zmm2
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%rcx), %zmm5
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 (%rcx), %zmm4
-; AVX512DQ-FAST-NEXT:    vmovdqa64 (%r8), %zmm22
+; AVX512DQ-FAST-NEXT:    vmovdqa64 (%r8), %zmm20
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%r8), %zmm19
-; AVX512DQ-FAST-NEXT:    vmovdqa64 (%r9), %zmm29
-; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%r9), %zmm25
-; AVX512DQ-FAST-NEXT:    vmovdqa64 (%r10), %zmm18
-; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%r10), %zmm17
+; AVX512DQ-FAST-NEXT:    vmovdqa64 (%r9), %zmm30
+; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%r9), %zmm15
+; AVX512DQ-FAST-NEXT:    vmovdqa64 (%r10), %zmm17
+; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%r10), %zmm18
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 (%rax), %zmm21
-; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%rax), %zmm20
+; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%rax), %zmm22
 ; AVX512DQ-FAST-NEXT:    movb $-64, %r11b
 ; AVX512DQ-FAST-NEXT:    kmovw %r11d, %k1
-; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm12 = [4,12,4,12,4,12,4,12]
-; AVX512DQ-FAST-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm18, %zmm7
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm21, %zmm12, %zmm7
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm9 = zmm22[0],zmm29[0],zmm22[2],zmm29[2],zmm22[4],zmm29[4],zmm22[6],zmm29[6]
+; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm13 = [4,12,4,12,4,12,4,12]
+; AVX512DQ-FAST-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm17, %zmm7
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm21, %zmm13, %zmm7
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm9 = zmm20[0],zmm30[0],zmm20[2],zmm30[2],zmm20[4],zmm30[4],zmm20[6],zmm30[6]
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm7, %zmm9 {%k1}
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm7
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm3, %zmm12, %zmm7
-; AVX512DQ-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm11 = [4,12,4,12]
-; AVX512DQ-FAST-NEXT:    # ymm11 = mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm3, %zmm13, %zmm7
+; AVX512DQ-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [4,12,4,12]
+; AVX512DQ-FAST-NEXT:    # ymm1 = mem[0,1,0,1]
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, %zmm10
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm4, %zmm11, %zmm10
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm4, %zmm1, %zmm10
 ; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7]
 ; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm7, %zmm9, %zmm7
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm14 = [5,13,5,13,5,13,5,13]
 ; AVX512DQ-FAST-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm18, %zmm7
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm17, %zmm7
 ; AVX512DQ-FAST-NEXT:    vpermt2q %zmm21, %zmm14, %zmm7
-; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm9 = zmm22[1],zmm29[1],zmm22[3],zmm29[3],zmm22[5],zmm29[5],zmm22[7],zmm29[7]
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm9 = zmm20[1],zmm30[1],zmm20[3],zmm30[3],zmm20[5],zmm30[5],zmm20[7],zmm30[7]
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm7, %zmm9 {%k1}
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm7
 ; AVX512DQ-FAST-NEXT:    vpermt2q %zmm3, %zmm14, %zmm7
-; AVX512DQ-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm13 = [5,13,5,13]
-; AVX512DQ-FAST-NEXT:    # ymm13 = mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm11 = [5,13,5,13]
+; AVX512DQ-FAST-NEXT:    # ymm11 = mem[0,1,0,1]
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, %zmm10
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm4, %zmm13, %zmm10
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm4, %zmm11, %zmm10
 ; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7]
 ; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm7, %zmm9, %zmm7
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm15 = [6,14,6,14,6,14,6,14]
-; AVX512DQ-FAST-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm22, %zmm7
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm29, %zmm15, %zmm7
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm18[0],zmm21[0],zmm18[2],zmm21[2],zmm18[4],zmm21[4],zmm18[6],zmm21[6]
+; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm12 = [6,14,6,14,6,14,6,14]
+; AVX512DQ-FAST-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm20, %zmm7
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm30, %zmm12, %zmm7
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm17[0],zmm21[0],zmm17[2],zmm21[2],zmm17[4],zmm21[4],zmm17[6],zmm21[6]
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm9
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm3, %zmm15, %zmm9
-; AVX512DQ-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm11 = [6,14,6,14]
-; AVX512DQ-FAST-NEXT:    # ymm11 = mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm3, %zmm12, %zmm9
+; AVX512DQ-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14]
+; AVX512DQ-FAST-NEXT:    # ymm1 = mem[0,1,0,1]
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, %zmm10
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm4, %zmm11, %zmm10
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm4, %zmm1, %zmm10
 ; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7]
 ; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm9, %zmm7, %zmm7
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm26 = [7,15,7,15,7,15,7,15]
 ; AVX512DQ-FAST-NEXT:    # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm22, %zmm7
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm29, %zmm26, %zmm7
-; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm7 {%k1} = zmm18[1],zmm21[1],zmm18[3],zmm21[3],zmm18[5],zmm21[5],zmm18[7],zmm21[7]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm20, %zmm7
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm30, %zmm26, %zmm7
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm7 {%k1} = zmm17[1],zmm21[1],zmm17[3],zmm21[3],zmm17[5],zmm21[5],zmm17[7],zmm21[7]
 ; AVX512DQ-FAST-NEXT:    vpermt2q %zmm3, %zmm26, %zmm0
 ; AVX512DQ-FAST-NEXT:    vbroadcasti64x2 {{.*#+}} ymm28 = [7,15,7,15]
 ; AVX512DQ-FAST-NEXT:    # ymm28 = mem[0,1,0,1]
@@ -7043,12 +5689,12 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
 ; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm7, %zmm0
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm17, %zmm0
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm20, %zmm12, %zmm0
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm2 = zmm19[0],zmm25[0],zmm19[2],zmm25[2],zmm19[4],zmm25[4],zmm19[6],zmm25[6]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm18, %zmm0
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm22, %zmm13, %zmm0
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm2 = zmm19[0],zmm15[0],zmm19[2],zmm15[2],zmm19[4],zmm15[4],zmm19[6],zmm15[6]
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k1}
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm6, %zmm0
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm1, %zmm12, %zmm0
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm16, %zmm13, %zmm0
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm8, %zmm3
 ; AVX512DQ-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [4,12,4,12]
 ; AVX512DQ-FAST-NEXT:    # ymm4 = mem[0,1,0,1]
@@ -7056,383 +5702,366 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
 ; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm0
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm17, %zmm0
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm20, %zmm14, %zmm0
-; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm2 = zmm19[1],zmm25[1],zmm19[3],zmm25[3],zmm19[5],zmm25[5],zmm19[7],zmm25[7]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm18, %zmm0
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm22, %zmm14, %zmm0
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm2 = zmm19[1],zmm15[1],zmm19[3],zmm15[3],zmm19[5],zmm15[5],zmm19[7],zmm15[7]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm19, %zmm9
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k1}
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm6, %zmm0
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm1, %zmm14, %zmm0
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm16, %zmm14, %zmm0
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm8, %zmm3
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm13, %zmm7
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm5, %zmm13, %zmm3
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm11, %zmm7
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm5, %zmm11, %zmm3
 ; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
 ; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm0
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm6, %zmm0
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm1, %zmm15, %zmm0
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm16, %zmm12, %zmm0
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm8, %zmm2
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm5, %zmm11, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm1, %zmm3
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm5, %zmm1, %zmm2
 ; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm19, %zmm2
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm25, %zmm15, %zmm2
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm17[0],zmm20[0],zmm17[2],zmm20[2],zmm17[4],zmm20[4],zmm17[6],zmm20[6]
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm15, %zmm12, %zmm2
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm18[0],zmm22[0],zmm18[2],zmm22[2],zmm18[4],zmm22[4],zmm18[6],zmm22[6]
 ; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm0
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 128(%rcx), %zmm0
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm1, %zmm26, %zmm6
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm16, %zmm26, %zmm6
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm19, %zmm1
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm25, %zmm26, %zmm1
-; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm17[1],zmm20[1],zmm17[3],zmm20[3],zmm17[5],zmm20[5],zmm17[7],zmm20[7]
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm15, %zmm26, %zmm1
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm18[1],zmm22[1],zmm18[3],zmm22[3],zmm18[5],zmm22[5],zmm18[7],zmm22[7]
 ; AVX512DQ-FAST-NEXT:    vpermt2q %zmm5, %zmm28, %zmm8
 ; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm8[4,5,6,7]
 ; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm1, %zmm1
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm13, %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqu64 (%rsp), %zmm11 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm11, %zmm1
 ; AVX512DQ-FAST-NEXT:    vpermt2q %zmm0, %zmm4, %zmm1
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm13, %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm11, %zmm1
 ; AVX512DQ-FAST-NEXT:    vpermt2q %zmm0, %zmm7, %zmm1
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm13, %zmm1
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm0, %zmm11, %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm11, %zmm1
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm0, %zmm3, %zmm1
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm0, %zmm28, %zmm13
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm0, %zmm28, %zmm11
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm11, (%rsp) # 64-byte Spill
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 192(%rcx), %zmm0
-; AVX512DQ-FAST-NEXT:    vmovdqu64 (%rsp), %zmm13 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vpermi2q %zmm0, %zmm13, %zmm4
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm0, %zmm11, %zmm4
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vpermi2q %zmm0, %zmm13, %zmm7
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm0, %zmm11, %zmm7
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vpermi2q %zmm0, %zmm13, %zmm11
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm0, %zmm11, %zmm3
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm0, %zmm28, %zmm11
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm0, %zmm28, %zmm13
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm13, (%rsp) # 64-byte Spill
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm23, %zmm12, %zmm1
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm23, %zmm13, %zmm1
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1
 ; AVX512DQ-FAST-NEXT:    vpermt2q %zmm23, %zmm14, %zmm1
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm23, %zmm15, %zmm1
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm23, %zmm12, %zmm1
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FAST-NEXT:    vpermt2q %zmm23, %zmm26, %zmm0
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 128(%r10), %zmm28
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 128(%rax), %zmm4
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm28, %zmm0
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm4, %zmm12, %zmm0
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm4, %zmm13, %zmm0
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm28, %zmm0
 ; AVX512DQ-FAST-NEXT:    vpermt2q %zmm4, %zmm14, %zmm0
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 128(%r8), %zmm16
-; AVX512DQ-FAST-NEXT:    vmovdqa64 128(%r9), %zmm13
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm16, %zmm0
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm13, %zmm15, %zmm0
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm9
+; AVX512DQ-FAST-NEXT:    vmovdqa64 128(%r9), %zmm11
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm16, %zmm24
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm11, %zmm12, %zmm24
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 192(%r10), %zmm23
-; AVX512DQ-FAST-NEXT:    vmovdqa64 192(%rax), %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqa64 192(%rax), %zmm5
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm23, %zmm0
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm1, %zmm12, %zmm0
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm5, %zmm13, %zmm0
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 192(%rdi), %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqa64 192(%rdi), %zmm1
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 192(%rsi), %zmm0
-; AVX512DQ-FAST-NEXT:    vpermi2q %zmm0, %zmm2, %zmm12
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm23, %zmm3
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm1, %zmm14, %zmm3
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vpermi2q %zmm0, %zmm2, %zmm14
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm0, %zmm1, %zmm13
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm23, %zmm2
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm5, %zmm14, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm0, %zmm1, %zmm14
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 192(%r8), %zmm31
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 192(%r9), %zmm3
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm31, %zmm5
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm3, %zmm15, %zmm5
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm5, %zmm10
-; AVX512DQ-FAST-NEXT:    vpermi2q %zmm0, %zmm2, %zmm15
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm0, %zmm26, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm31, %zmm29
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm3, %zmm12, %zmm29
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm0, %zmm1, %zmm12
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm0, %zmm26, %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm16, %zmm19
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm11, %zmm26, %zmm19
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm31, %zmm27
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm3, %zmm26, %zmm27
+; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,0,8,0,8,0,8]
+; AVX512DQ-FAST-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm17, %zmm1
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm21, %zmm0, %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [1,9,1,9,1,9,1,9]
+; AVX512DQ-FAST-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm17, %zmm2
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm21, %zmm1, %zmm2
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm16, %zmm27
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm13, %zmm26, %zmm27
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm31, %zmm0
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm3, %zmm26, %zmm0
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm2
-; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm5 = [0,8,0,8,0,8,0,8]
-; AVX512DQ-FAST-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm18, %zmm30
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm21, %zmm5, %zmm30
-; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm6 = [1,9,1,9,1,9,1,9]
-; AVX512DQ-FAST-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm18, %zmm0
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm21, %zmm6, %zmm0
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm26 = [2,10,2,10,2,10,2,10]
 ; AVX512DQ-FAST-NEXT:    # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm18, %zmm0
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm21, %zmm26, %zmm0
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11]
-; AVX512DQ-FAST-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm21, %zmm0, %zmm18
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm22, %zmm7
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm22, %zmm24
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm29, %zmm5, %zmm24
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm29, %zmm6, %zmm22
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm7, %zmm21
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm29, %zmm26, %zmm21
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm29, %zmm0, %zmm7
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm17, %zmm29
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm20, %zmm5, %zmm29
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm17, %zmm7
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm20, %zmm6, %zmm7
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm17, %zmm7
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm20, %zmm26, %zmm7
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm20, %zmm0, %zmm17
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm17, %zmm2
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm21, %zmm26, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,3,11,3,11,3,11]
+; AVX512DQ-FAST-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm21, %zmm2, %zmm17
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm19, %zmm7
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm25, %zmm5, %zmm7
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm19, %zmm8
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm25, %zmm6, %zmm8
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm19, %zmm20
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm25, %zmm26, %zmm20
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm25, %zmm0, %zmm19
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm28, %zmm19
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm4, %zmm5, %zmm19
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm28, %zmm25
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm4, %zmm6, %zmm25
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm28[0],zmm4[0],zmm28[2],zmm4[2],zmm28[4],zmm4[4],zmm28[6],zmm4[6]
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm28[1],zmm4[1],zmm28[3],zmm4[3],zmm28[5],zmm4[5],zmm28[7],zmm4[7]
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm28, %zmm9
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm4, %zmm26, %zmm9
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm4, %zmm0, %zmm28
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm16, %zmm9
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm13, %zmm5, %zmm9
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm20, %zmm25
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm30, %zmm0, %zmm25
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm20, %zmm17
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm30, %zmm1, %zmm17
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm20, %zmm21
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm30, %zmm26, %zmm21
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm30, %zmm2, %zmm20
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm18, %zmm20
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm22, %zmm0, %zmm20
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm18, %zmm30
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm22, %zmm1, %zmm30
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm18, %zmm6
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm22, %zmm26, %zmm6
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm22, %zmm2, %zmm18
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm9, %zmm8
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm9, %zmm6
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm15, %zmm0, %zmm6
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm9, %zmm7
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm15, %zmm1, %zmm7
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm9, %zmm22
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm15, %zmm26, %zmm22
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm15, %zmm2, %zmm8
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm28, %zmm14
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm4, %zmm0, %zmm14
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm28, %zmm18
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm4, %zmm1, %zmm18
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm24 {%k1} = zmm28[0],zmm4[0],zmm28[2],zmm4[2],zmm28[4],zmm4[4],zmm28[6],zmm4[6]
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm19 {%k1} = zmm28[1],zmm4[1],zmm28[3],zmm4[3],zmm28[5],zmm4[5],zmm28[7],zmm4[7]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm28, %zmm8
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm4, %zmm26, %zmm8
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm4, %zmm2, %zmm28
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm16, %zmm8
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm11, %zmm0, %zmm8
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm16, %zmm15
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm13, %zmm6, %zmm15
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm16, %zmm12
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm13, %zmm26, %zmm12
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm14 = zmm16[0],zmm13[0],zmm16[2],zmm13[2],zmm16[4],zmm13[4],zmm16[6],zmm13[6]
-; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm17 = zmm16[1],zmm13[1],zmm16[3],zmm13[3],zmm16[5],zmm13[5],zmm16[7],zmm13[7]
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm13, %zmm0, %zmm16
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm11, %zmm1, %zmm15
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm16, %zmm9
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm11, %zmm26, %zmm9
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm10 = zmm16[0],zmm11[0],zmm16[2],zmm11[2],zmm16[4],zmm11[4],zmm16[6],zmm11[6]
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm12 = zmm16[1],zmm11[1],zmm16[3],zmm11[3],zmm16[5],zmm11[5],zmm16[7],zmm11[7]
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm11, %zmm2, %zmm16
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm23, %zmm11
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm5, %zmm0, %zmm11
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm23, %zmm13
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm1, %zmm5, %zmm13
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm23, %zmm18
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm1, %zmm6, %zmm18
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm10 {%k1} = zmm23[0],zmm1[0],zmm23[2],zmm1[2],zmm23[4],zmm1[4],zmm23[6],zmm1[6]
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm23[1],zmm1[1],zmm23[3],zmm1[3],zmm23[5],zmm1[5],zmm23[7],zmm1[7]
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm23, %zmm2
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm1, %zmm26, %zmm2
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm1, %zmm0, %zmm23
-; AVX512DQ-FAST-NEXT:    vpermi2q %zmm3, %zmm31, %zmm5
-; AVX512DQ-FAST-NEXT:    vpermi2q %zmm3, %zmm31, %zmm6
-; AVX512DQ-FAST-NEXT:    vpermi2q %zmm3, %zmm31, %zmm26
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm10 = zmm31[0],zmm3[0],zmm31[2],zmm3[2],zmm31[4],zmm3[4],zmm31[6],zmm3[6]
-; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm11 = zmm31[1],zmm3[1],zmm31[3],zmm3[3],zmm31[5],zmm3[5],zmm31[7],zmm3[7]
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm3, %zmm0, %zmm31
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm30, %zmm24 {%k1}
-; AVX512DQ-FAST-NEXT:    vmovdqa (%rcx), %xmm3
-; AVX512DQ-FAST-NEXT:    vmovdqa (%rdx), %xmm2
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm2[0],xmm3[0]
-; AVX512DQ-FAST-NEXT:    vmovdqa (%rsi), %xmm1
-; AVX512DQ-FAST-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm30 = xmm0[0],xmm1[0]
-; AVX512DQ-FAST-NEXT:    vinserti32x4 $1, %xmm4, %ymm30, %ymm4
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm24, %zmm4
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm5, %zmm1, %zmm13
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm29 {%k1} = zmm23[0],zmm5[0],zmm23[2],zmm5[2],zmm23[4],zmm5[4],zmm23[6],zmm5[6]
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm23[1],zmm5[1],zmm23[3],zmm5[3],zmm23[5],zmm5[5],zmm23[7],zmm5[7]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm23, %zmm4
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm5, %zmm26, %zmm4
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm4, %zmm22 {%k1}
-; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1]
-; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; AVX512DQ-FAST-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm22, %zmm0
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm29, %zmm7 {%k1}
-; AVX512DQ-FAST-NEXT:    vmovdqa 64(%rcx), %xmm0
-; AVX512DQ-FAST-NEXT:    vmovdqa 64(%rdx), %xmm1
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX512DQ-FAST-NEXT:    vmovdqa 64(%rsi), %xmm4
-; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%rdi), %xmm29
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm22 = xmm29[0],xmm4[0]
-; AVX512DQ-FAST-NEXT:    vinserti32x4 $1, %xmm2, %ymm22, %ymm2
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm5, %zmm2, %zmm23
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm3, %zmm31, %zmm0
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm3, %zmm31, %zmm1
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm3, %zmm31, %zmm26
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm5 = zmm31[0],zmm3[0],zmm31[2],zmm3[2],zmm31[4],zmm3[4],zmm31[6],zmm3[6]
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm4 = zmm31[1],zmm3[1],zmm31[3],zmm3[3],zmm31[5],zmm3[5],zmm31[7],zmm3[7]
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm3, %zmm2, %zmm31
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, %zmm25 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqa (%rsi), %xmm2
+; AVX512DQ-FAST-NEXT:    vinserti128 $1, (%rcx), %ymm2, %ymm3
+; AVX512DQ-FAST-NEXT:    vmovdqa (%rdi), %xmm2
+; AVX512DQ-FAST-NEXT:    vinserti128 $1, (%rdx), %ymm2, %ymm2
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm29 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm29, %zmm25, %zmm25
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm25, %zmm17 {%k1}
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm17, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm20, %zmm6 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqa 64(%rsi), %xmm2
+; AVX512DQ-FAST-NEXT:    vinserti128 $1, 64(%rcx), %ymm2, %ymm2
+; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%rdi), %xmm17
+; AVX512DQ-FAST-NEXT:    vinserti32x4 $1, 64(%rdx), %ymm17, %ymm17
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm20 = ymm17[0],ymm2[0],ymm17[2],ymm2[2]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm20, %zmm6, %zmm3
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm30, %zmm7 {%k1}
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm17[1],ymm2[1],ymm17[3],ymm2[3]
 ; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm7, %zmm2
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, %zmm8 {%k1}
-; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm1 = xmm29[1],xmm4[1]
-; AVX512DQ-FAST-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm8, %zmm30
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm19, %zmm9 {%k1}
-; AVX512DQ-FAST-NEXT:    vmovdqa 128(%rcx), %xmm1
-; AVX512DQ-FAST-NEXT:    vmovdqa 128(%rdx), %xmm2
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm2[0],xmm1[0]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm14, %zmm8 {%k1}
 ; AVX512DQ-FAST-NEXT:    vmovdqa 128(%rsi), %xmm7
-; AVX512DQ-FAST-NEXT:    vmovdqa 128(%rdi), %xmm8
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm19 = xmm8[0],xmm7[0]
-; AVX512DQ-FAST-NEXT:    vinserti32x4 $1, %xmm4, %ymm19, %ymm4
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm9, %zmm29
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm25, %zmm15 {%k1}
-; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm1 = xmm2[1],xmm1[1]
-; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm8[1],xmm7[1]
-; AVX512DQ-FAST-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm15, %zmm24
+; AVX512DQ-FAST-NEXT:    vinserti128 $1, 128(%rcx), %ymm7, %ymm7
+; AVX512DQ-FAST-NEXT:    vmovdqa 128(%rdi), %xmm14
+; AVX512DQ-FAST-NEXT:    vinserti128 $1, 128(%rdx), %ymm14, %ymm14
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm17 = ymm14[0],ymm7[0],ymm14[2],ymm7[2]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm17, %zmm8, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm18, %zmm15 {%k1}
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm7 = ymm14[1],ymm7[1],ymm14[3],ymm7[3]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm7, %zmm15, %zmm30
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, %zmm10 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm14 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    # ymm14 = ymm2[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm14, %zmm10, %zmm29
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, %zmm12 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm14 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    # ymm14 = ymm2[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm14, %zmm12, %zmm25
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm14 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    # ymm14 = ymm2[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm14, %zmm24, %zmm24
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpblendd $240, (%rsp), %ymm2, %ymm15 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    # ymm15 = ymm2[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm15, %zmm19, %zmm15
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm11, %zmm0 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqa 192(%rsi), %xmm11
+; AVX512DQ-FAST-NEXT:    vinserti128 $1, 192(%rcx), %ymm11, %ymm11
+; AVX512DQ-FAST-NEXT:    vmovdqa64 192(%rdi), %xmm17
+; AVX512DQ-FAST-NEXT:    vinserti32x4 $1, 192(%rdx), %ymm17, %ymm17
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm18 = ymm17[0],ymm11[0],ymm17[2],ymm11[2]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm18, %zmm0, %zmm7
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm13, %zmm1 {%k1}
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm11 = ymm17[1],ymm11[1],ymm17[3],ymm11[3]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm11, %zmm1, %zmm1
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm14 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm5 {%k1}
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
-; AVX512DQ-FAST-NEXT:    # ymm2 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm14, %zmm22
+; AVX512DQ-FAST-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    # ymm11 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm11, %zmm5, %zmm5
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm17 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm4 {%k1}
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload
-; AVX512DQ-FAST-NEXT:    # ymm7 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm7, %zmm17, %zmm7
+; AVX512DQ-FAST-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    # ymm11 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm11, %zmm4, %zmm4
 ; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload
-; AVX512DQ-FAST-NEXT:    # ymm8 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    # ymm11 = ymm0[0,1,2,3],mem[4,5,6,7]
 ; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vinsertf64x4 $0, %ymm8, %zmm0, %zmm8
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload
-; AVX512DQ-FAST-NEXT:    # ymm9 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm9, %zmm27, %zmm27
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm13, %zmm5 {%k1}
-; AVX512DQ-FAST-NEXT:    vmovdqa 192(%rcx), %xmm13
-; AVX512DQ-FAST-NEXT:    vmovdqa 192(%rdx), %xmm14
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm15 = xmm14[0],xmm13[0]
-; AVX512DQ-FAST-NEXT:    vmovdqa64 192(%rsi), %xmm17
-; AVX512DQ-FAST-NEXT:    vmovdqa64 192(%rdi), %xmm19
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm25 = xmm19[0],xmm17[0]
-; AVX512DQ-FAST-NEXT:    vinserti32x4 $1, %xmm15, %ymm25, %ymm15
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm15, %zmm5, %zmm25
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm18, %zmm6 {%k1}
-; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm13 = xmm14[1],xmm13[1]
-; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm14 = xmm19[1],xmm17[1]
-; AVX512DQ-FAST-NEXT:    vinserti128 $1, %xmm13, %ymm14, %ymm13
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm13, %zmm6, %zmm6
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm10 {%k1}
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload
-; AVX512DQ-FAST-NEXT:    # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm13, %zmm10, %zmm10
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm11 {%k1}
+; AVX512DQ-FAST-NEXT:    vinsertf64x4 $0, %ymm11, %zmm0, %zmm11
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQ-FAST-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload
 ; AVX512DQ-FAST-NEXT:    # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm13, %zmm11, %zmm11
-; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload
-; AVX512DQ-FAST-NEXT:    # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vinsertf64x4 $0, %ymm13, %zmm0, %zmm13
-; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vblendps $240, (%rsp), %ymm0, %ymm14 # 32-byte Folded Reload
-; AVX512DQ-FAST-NEXT:    # ymm14 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vinsertf64x4 $0, %ymm14, %zmm0, %zmm14
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm13, %zmm27, %zmm27
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm21 {%k1}
-; AVX512DQ-FAST-NEXT:    vmovdqa (%rcx), %ymm15
-; AVX512DQ-FAST-NEXT:    vmovdqa64 (%rdx), %ymm17
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm17[0],ymm15[0],ymm17[2],ymm15[2]
-; AVX512DQ-FAST-NEXT:    vmovdqa64 (%rsi), %ymm18
-; AVX512DQ-FAST-NEXT:    vmovdqa64 (%rdi), %ymm19
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm19[0],ymm18[0],ymm19[2],ymm18[2]
-; AVX512DQ-FAST-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3]
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm21, %zmm21
+; AVX512DQ-FAST-NEXT:    vmovdqa64 (%rcx), %ymm17
+; AVX512DQ-FAST-NEXT:    vmovdqa64 (%rdx), %ymm18
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm18[0],ymm17[0],ymm18[2],ymm17[2]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 (%rsi), %ymm19
+; AVX512DQ-FAST-NEXT:    vmovdqa64 (%rdi), %ymm20
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm6 = ymm20[0],ymm19[0],ymm20[2],ymm19[2]
+; AVX512DQ-FAST-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm6[2,3],ymm3[2,3]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm21, %zmm3
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm17[1],ymm15[1],ymm17[3],ymm15[3]
-; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm15 = ymm19[1],ymm18[1],ymm19[3],ymm18[3]
-; AVX512DQ-FAST-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm15[2,3],ymm3[2,3]
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm1, %zmm3
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm8 {%k1}
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm6 = ymm18[1],ymm17[1],ymm18[3],ymm17[3]
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm20[1],ymm19[1],ymm20[3],ymm19[3]
+; AVX512DQ-FAST-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm2[2,3],ymm6[2,3]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm8, %zmm2
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm20 {%k1}
-; AVX512DQ-FAST-NEXT:    vmovdqa 64(%rsi), %ymm15
-; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%rdi), %ymm17
-; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%rcx), %ymm18
-; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%rdx), %ymm19
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm9 = ymm19[0],ymm18[0],ymm19[2],ymm18[2]
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm4 = ymm17[0],ymm15[0],ymm17[2],ymm15[2]
-; AVX512DQ-FAST-NEXT:    vperm2i128 {{.*#+}} ymm4 = ymm4[2,3],ymm9[2,3]
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm20, %zmm4
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm22 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqa 64(%rcx), %ymm6
+; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%rdx), %ymm17
+; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%rsi), %ymm18
+; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%rdi), %ymm19
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm14 = ymm17[0],ymm6[0],ymm17[2],ymm6[2]
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm8 = ymm19[0],ymm18[0],ymm19[2],ymm18[2]
+; AVX512DQ-FAST-NEXT:    vperm2i128 {{.*#+}} ymm8 = ymm8[2,3],ymm14[2,3]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm8, %zmm22, %zmm8
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm9 = ymm19[1],ymm18[1],ymm19[3],ymm18[3]
-; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm15 = ymm17[1],ymm15[1],ymm17[3],ymm15[3]
-; AVX512DQ-FAST-NEXT:    vperm2i128 {{.*#+}} ymm9 = ymm15[2,3],ymm9[2,3]
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm9, %zmm1, %zmm9
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm10 {%k1}
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm6 = ymm17[1],ymm6[1],ymm17[3],ymm6[3]
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm14 = ymm19[1],ymm18[1],ymm19[3],ymm18[3]
+; AVX512DQ-FAST-NEXT:    vperm2i128 {{.*#+}} ymm6 = ymm14[2,3],ymm6[2,3]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm6, %zmm10, %zmm6
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm12 {%k1}
-; AVX512DQ-FAST-NEXT:    vmovdqa 128(%rcx), %ymm15
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm9 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqa 128(%rcx), %ymm14
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 128(%rdx), %ymm17
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm5 = ymm17[0],ymm15[0],ymm17[2],ymm15[2]
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm17[0],ymm14[0],ymm17[2],ymm14[2]
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 128(%rsi), %ymm18
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 128(%rdi), %ymm19
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm19[0],ymm18[0],ymm19[2],ymm18[2]
-; AVX512DQ-FAST-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm5[2,3]
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm12, %zmm1
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm10 = ymm19[0],ymm18[0],ymm19[2],ymm18[2]
+; AVX512DQ-FAST-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm10[2,3],ymm0[2,3]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm9, %zmm0
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm28, %zmm16 {%k1}
-; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm5 = ymm17[1],ymm15[1],ymm17[3],ymm15[3]
-; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm12 = ymm19[1],ymm18[1],ymm19[3],ymm18[3]
-; AVX512DQ-FAST-NEXT:    vperm2i128 {{.*#+}} ymm5 = ymm12[2,3],ymm5[2,3]
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm5, %zmm16, %zmm5
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm26 {%k1}
-; AVX512DQ-FAST-NEXT:    vmovdqa 192(%rcx), %ymm12
-; AVX512DQ-FAST-NEXT:    vmovdqa 192(%rdx), %ymm15
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm15[0],ymm12[0],ymm15[2],ymm12[2]
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm9 = ymm17[1],ymm14[1],ymm17[3],ymm14[3]
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm10 = ymm19[1],ymm18[1],ymm19[3],ymm18[3]
+; AVX512DQ-FAST-NEXT:    vperm2i128 {{.*#+}} ymm9 = ymm10[2,3],ymm9[2,3]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm9, %zmm16, %zmm9
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm10, %zmm26 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqa 192(%rcx), %ymm10
+; AVX512DQ-FAST-NEXT:    vmovdqa 192(%rdx), %ymm14
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm13 = ymm14[0],ymm10[0],ymm14[2],ymm10[2]
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 192(%rsi), %ymm16
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 192(%rdi), %ymm17
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm2 = ymm17[0],ymm16[0],ymm17[2],ymm16[2]
-; AVX512DQ-FAST-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3]
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm26, %zmm0
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm12 = ymm17[0],ymm16[0],ymm17[2],ymm16[2]
+; AVX512DQ-FAST-NEXT:    vperm2i128 {{.*#+}} ymm12 = ymm12[2,3],ymm13[2,3]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm12, %zmm26, %zmm12
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm23, %zmm31 {%k1}
-; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm15[1],ymm12[1],ymm15[3],ymm12[3]
-; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm12 = ymm17[1],ymm16[1],ymm17[3],ymm16[3]
-; AVX512DQ-FAST-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm12[2,3],ymm2[2,3]
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm31, %zmm2
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm10 = ymm14[1],ymm10[1],ymm14[3],ymm10[3]
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm13 = ymm17[1],ymm16[1],ymm17[3],ymm16[3]
+; AVX512DQ-FAST-NEXT:    vperm2i128 {{.*#+}} ymm10 = ymm13[2,3],ymm10[2,3]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm10, %zmm31, %zmm10
 ; AVX512DQ-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, 1728(%rax)
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, 1664(%rax)
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm5, 1216(%rax)
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm1, 1152(%rax)
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm9, 704(%rax)
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm4, 640(%rax)
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, 192(%rax)
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm21, 128(%rax)
-; AVX512DQ-FAST-NEXT:    vmovaps %zmm14, 1984(%rax)
-; AVX512DQ-FAST-NEXT:    vmovaps %zmm13, 1920(%rax)
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm11, 1856(%rax)
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm10, 1792(%rax)
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm6, 1600(%rax)
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm25, 1536(%rax)
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm27, 1472(%rax)
-; AVX512DQ-FAST-NEXT:    vmovaps %zmm8, 1408(%rax)
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm7, 1344(%rax)
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm22, 1280(%rax)
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm24, 1088(%rax)
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm29, 1024(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm10, 1728(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm12, 1664(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm9, 1216(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, 1152(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm6, 704(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm8, 640(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, 192(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, 128(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm27, 1984(%rax)
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm11, 1920(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm4, 1856(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm5, 1792(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm1, 1600(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm7, 1536(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm15, 1472(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm24, 1408(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm25, 1344(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm29, 1280(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm30, 1088(%rax)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 1024(%rax)
 ; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 960(%rax)
 ; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
@@ -7441,7 +6070,8 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 832(%rax)
 ; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 768(%rax)
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm30, 576(%rax)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 576(%rax)
 ; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 512(%rax)
 ; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
@@ -7456,13 +6086,13 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 64(%rax)
 ; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, (%rax)
-; AVX512DQ-FAST-NEXT:    addq $2632, %rsp # imm = 0xA48
+; AVX512DQ-FAST-NEXT:    addq $2568, %rsp # imm = 0xA08
 ; AVX512DQ-FAST-NEXT:    vzeroupper
 ; AVX512DQ-FAST-NEXT:    retq
 ;
 ; AVX512BW-ONLY-SLOW-LABEL: store_i64_stride8_vf32:
 ; AVX512BW-ONLY-SLOW:       # %bb.0:
-; AVX512BW-ONLY-SLOW-NEXT:    subq $2632, %rsp # imm = 0xA48
+; AVX512BW-ONLY-SLOW-NEXT:    subq $2568, %rsp # imm = 0xA08
 ; AVX512BW-ONLY-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512BW-ONLY-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %r10
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovaps 128(%rdi), %zmm0
@@ -7470,75 +6100,75 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 64(%rdi), %zmm6
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm0
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 128(%rsi), %zmm23
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 64(%rsi), %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 64(%rsi), %zmm16
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%rsi), %zmm3
-; AVX512BW-ONLY-SLOW-NEXT:    vmovaps 192(%rdx), %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vmovups %zmm2, (%rsp) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovaps 128(%rdx), %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps 192(%rdx), %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps 128(%rdx), %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups %zmm1, (%rsp) # 64-byte Spill
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 64(%rdx), %zmm8
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%rdx), %zmm2
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 64(%rcx), %zmm5
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%rcx), %zmm4
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%r8), %zmm22
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%r8), %zmm20
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 64(%r8), %zmm19
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%r9), %zmm29
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 64(%r9), %zmm25
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%r10), %zmm18
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 64(%r10), %zmm17
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%r9), %zmm30
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 64(%r9), %zmm15
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%r10), %zmm17
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 64(%r10), %zmm18
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%rax), %zmm21
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 64(%rax), %zmm20
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 64(%rax), %zmm22
 ; AVX512BW-ONLY-SLOW-NEXT:    movb $-64, %r11b
 ; AVX512BW-ONLY-SLOW-NEXT:    kmovd %r11d, %k1
-; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm12 = [4,12,4,12,4,12,4,12]
-; AVX512BW-ONLY-SLOW-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm7
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm21, %zmm12, %zmm7
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm9 = zmm22[0],zmm29[0],zmm22[2],zmm29[2],zmm22[4],zmm29[4],zmm22[6],zmm29[6]
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm13 = [4,12,4,12,4,12,4,12]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm21, %zmm13, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm9 = zmm20[0],zmm30[0],zmm20[2],zmm30[2],zmm20[4],zmm30[4],zmm20[6],zmm30[6]
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm9 {%k1}
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm7
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm12, %zmm7
-; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm11 = [4,12,4,12]
-; AVX512BW-ONLY-SLOW-NEXT:    # ymm11 = mem[0,1,0,1]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm13, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [4,12,4,12]
+; AVX512BW-ONLY-SLOW-NEXT:    # ymm1 = mem[0,1,0,1]
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm10
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm11, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm1, %zmm10
 ; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7]
 ; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm7, %zmm9, %zmm7
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm14 = [5,13,5,13,5,13,5,13]
 ; AVX512BW-ONLY-SLOW-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm7
 ; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm21, %zmm14, %zmm7
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm9 = zmm22[1],zmm29[1],zmm22[3],zmm29[3],zmm22[5],zmm29[5],zmm22[7],zmm29[7]
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm9 = zmm20[1],zmm30[1],zmm20[3],zmm30[3],zmm20[5],zmm30[5],zmm20[7],zmm30[7]
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm9 {%k1}
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm7
 ; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm14, %zmm7
-; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm13 = [5,13,5,13]
-; AVX512BW-ONLY-SLOW-NEXT:    # ymm13 = mem[0,1,0,1]
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm11 = [5,13,5,13]
+; AVX512BW-ONLY-SLOW-NEXT:    # ymm11 = mem[0,1,0,1]
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm10
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm13, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm11, %zmm10
 ; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7]
 ; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm7, %zmm9, %zmm7
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm15 = [6,14,6,14,6,14,6,14]
-; AVX512BW-ONLY-SLOW-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm22, %zmm7
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm29, %zmm15, %zmm7
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm18[0],zmm21[0],zmm18[2],zmm21[2],zmm18[4],zmm21[4],zmm18[6],zmm21[6]
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm12 = [6,14,6,14,6,14,6,14]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm30, %zmm12, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm17[0],zmm21[0],zmm17[2],zmm21[2],zmm17[4],zmm21[4],zmm17[6],zmm21[6]
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm9
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm15, %zmm9
-; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm11 = [6,14,6,14]
-; AVX512BW-ONLY-SLOW-NEXT:    # ymm11 = mem[0,1,0,1]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm12, %zmm9
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14]
+; AVX512BW-ONLY-SLOW-NEXT:    # ymm1 = mem[0,1,0,1]
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm10
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm11, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm1, %zmm10
 ; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7]
 ; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm9, %zmm7, %zmm7
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm26 = [7,15,7,15,7,15,7,15]
 ; AVX512BW-ONLY-SLOW-NEXT:    # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm22, %zmm7
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm29, %zmm26, %zmm7
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm7 {%k1} = zmm18[1],zmm21[1],zmm18[3],zmm21[3],zmm18[5],zmm21[5],zmm18[7],zmm21[7]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm30, %zmm26, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm7 {%k1} = zmm17[1],zmm21[1],zmm17[3],zmm21[3],zmm17[5],zmm21[5],zmm17[7],zmm21[7]
 ; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm26, %zmm0
 ; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} ymm28 = [7,15,7,15]
 ; AVX512BW-ONLY-SLOW-NEXT:    # ymm28 = mem[0,1,2,3,0,1,2,3]
@@ -7546,12 +6176,12 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
 ; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm7, %zmm0
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm20, %zmm12, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm2 = zmm19[0],zmm25[0],zmm19[2],zmm25[2],zmm19[4],zmm25[4],zmm19[6],zmm25[6]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm22, %zmm13, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm2 = zmm19[0],zmm15[0],zmm19[2],zmm15[2],zmm19[4],zmm15[4],zmm19[6],zmm15[6]
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k1}
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm12, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm16, %zmm13, %zmm0
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm3
 ; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [4,12,4,12]
 ; AVX512BW-ONLY-SLOW-NEXT:    # ymm4 = mem[0,1,0,1]
@@ -7559,383 +6189,366 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
 ; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm0
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm20, %zmm14, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm2 = zmm19[1],zmm25[1],zmm19[3],zmm25[3],zmm19[5],zmm25[5],zmm19[7],zmm25[7]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm22, %zmm14, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm2 = zmm19[1],zmm15[1],zmm19[3],zmm15[3],zmm19[5],zmm15[5],zmm19[7],zmm15[7]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm19, %zmm9
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k1}
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm14, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm16, %zmm14, %zmm0
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm3
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm7
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm13, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm11, %zmm3
 ; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
 ; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm0
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm15, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm16, %zmm12, %zmm0
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm11, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm1, %zmm2
 ; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm19, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm25, %zmm15, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm17[0],zmm20[0],zmm17[2],zmm20[2],zmm17[4],zmm20[4],zmm17[6],zmm20[6]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm15, %zmm12, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm18[0],zmm22[0],zmm18[2],zmm22[2],zmm18[4],zmm22[4],zmm18[6],zmm22[6]
 ; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm0
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 128(%rcx), %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm26, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm16, %zmm26, %zmm6
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm19, %zmm1
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm25, %zmm26, %zmm1
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm17[1],zmm20[1],zmm17[3],zmm20[3],zmm17[5],zmm20[5],zmm17[7],zmm20[7]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm15, %zmm26, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm18[1],zmm22[1],zmm18[3],zmm22[3],zmm18[5],zmm22[5],zmm18[7],zmm22[7]
 ; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm28, %zmm8
 ; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm8[4,5,6,7]
 ; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm1, %zmm1
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 (%rsp), %zmm11 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm1
 ; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm4, %zmm1
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm1
 ; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm7, %zmm1
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm1
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm11, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm3, %zmm1
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm28, %zmm13
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm28, %zmm11
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm11, (%rsp) # 64-byte Spill
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 192(%rcx), %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 (%rsp), %zmm13 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm0, %zmm13, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm0, %zmm11, %zmm4
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm0, %zmm13, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm0, %zmm11, %zmm7
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm0, %zmm13, %zmm11
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm0, %zmm11, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm28, %zmm11
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm28, %zmm13
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm13, (%rsp) # 64-byte Spill
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm23, %zmm12, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm23, %zmm13, %zmm1
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1
 ; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm23, %zmm14, %zmm1
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm23, %zmm15, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm23, %zmm12, %zmm1
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm23, %zmm26, %zmm0
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 128(%r10), %zmm28
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 128(%rax), %zmm4
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm12, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm13, %zmm0
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm0
 ; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm14, %zmm0
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 128(%r8), %zmm16
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 128(%r9), %zmm13
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm13, %zmm15, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm9
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 128(%r9), %zmm11
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm24
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm11, %zmm12, %zmm24
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 192(%r10), %zmm23
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 192(%rax), %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 192(%rax), %zmm5
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm12, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm13, %zmm0
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 192(%rdi), %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 192(%rdi), %zmm1
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 192(%rsi), %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm0, %zmm2, %zmm12
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm3
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm14, %zmm3
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm0, %zmm2, %zmm14
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm0, %zmm1, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm14, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm0, %zmm1, %zmm14
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 192(%r8), %zmm31
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 192(%r9), %zmm3
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm5
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm15, %zmm5
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm10
-; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm0, %zmm2, %zmm15
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm26, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm29
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm12, %zmm29
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm0, %zmm1, %zmm12
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm26, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm19
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm11, %zmm26, %zmm19
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm27
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm26, %zmm27
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,0,8,0,8,0,8]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm21, %zmm0, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [1,9,1,9,1,9,1,9]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm21, %zmm1, %zmm2
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm27
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm13, %zmm26, %zmm27
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm26, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm5 = [0,8,0,8,0,8,0,8]
-; AVX512BW-ONLY-SLOW-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm30
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm21, %zmm5, %zmm30
-; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm6 = [1,9,1,9,1,9,1,9]
-; AVX512BW-ONLY-SLOW-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm21, %zmm6, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm26 = [2,10,2,10,2,10,2,10]
 ; AVX512BW-ONLY-SLOW-NEXT:    # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm21, %zmm26, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11]
-; AVX512BW-ONLY-SLOW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm21, %zmm0, %zmm18
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm22, %zmm7
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm22, %zmm24
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm29, %zmm5, %zmm24
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm29, %zmm6, %zmm22
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm21
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm29, %zmm26, %zmm21
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm29, %zmm0, %zmm7
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm29
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm20, %zmm5, %zmm29
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm7
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm20, %zmm6, %zmm7
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm7
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm20, %zmm26, %zmm7
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm20, %zmm0, %zmm17
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm21, %zmm26, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,3,11,3,11,3,11]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm21, %zmm2, %zmm17
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm19, %zmm7
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm25, %zmm5, %zmm7
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm19, %zmm8
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm25, %zmm6, %zmm8
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm19, %zmm20
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm25, %zmm26, %zmm20
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm25, %zmm0, %zmm19
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm19
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm5, %zmm19
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm25
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm6, %zmm25
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm28[0],zmm4[0],zmm28[2],zmm4[2],zmm28[4],zmm4[4],zmm28[6],zmm4[6]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm28[1],zmm4[1],zmm28[3],zmm4[3],zmm28[5],zmm4[5],zmm28[7],zmm4[7]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm9
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm26, %zmm9
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm0, %zmm28
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm9
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm13, %zmm5, %zmm9
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm25
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm30, %zmm0, %zmm25
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm17
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm30, %zmm1, %zmm17
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm21
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm30, %zmm26, %zmm21
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm30, %zmm2, %zmm20
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm20
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm22, %zmm0, %zmm20
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm30
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm22, %zmm1, %zmm30
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm22, %zmm26, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm22, %zmm2, %zmm18
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm15, %zmm0, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm15, %zmm1, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm22
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm15, %zmm26, %zmm22
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm15, %zmm2, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm14
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm0, %zmm14
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm18
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm1, %zmm18
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm24 {%k1} = zmm28[0],zmm4[0],zmm28[2],zmm4[2],zmm28[4],zmm4[4],zmm28[6],zmm4[6]
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm19 {%k1} = zmm28[1],zmm4[1],zmm28[3],zmm4[3],zmm28[5],zmm4[5],zmm28[7],zmm4[7]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm26, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm2, %zmm28
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm11, %zmm0, %zmm8
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm15
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm13, %zmm6, %zmm15
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm12
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm13, %zmm26, %zmm12
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm14 = zmm16[0],zmm13[0],zmm16[2],zmm13[2],zmm16[4],zmm13[4],zmm16[6],zmm13[6]
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm17 = zmm16[1],zmm13[1],zmm16[3],zmm13[3],zmm16[5],zmm13[5],zmm16[7],zmm13[7]
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm13, %zmm0, %zmm16
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm11, %zmm1, %zmm15
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm9
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm11, %zmm26, %zmm9
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm10 = zmm16[0],zmm11[0],zmm16[2],zmm11[2],zmm16[4],zmm11[4],zmm16[6],zmm11[6]
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm12 = zmm16[1],zmm11[1],zmm16[3],zmm11[3],zmm16[5],zmm11[5],zmm16[7],zmm11[7]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm11, %zmm2, %zmm16
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm11
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm0, %zmm11
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm13
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm5, %zmm13
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm18
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm6, %zmm18
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm10 {%k1} = zmm23[0],zmm1[0],zmm23[2],zmm1[2],zmm23[4],zmm1[4],zmm23[6],zmm1[6]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm23[1],zmm1[1],zmm23[3],zmm1[3],zmm23[5],zmm1[5],zmm23[7],zmm1[7]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm26, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm0, %zmm23
-; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm3, %zmm31, %zmm5
-; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm3, %zmm31, %zmm6
-; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm3, %zmm31, %zmm26
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm10 = zmm31[0],zmm3[0],zmm31[2],zmm3[2],zmm31[4],zmm3[4],zmm31[6],zmm3[6]
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm11 = zmm31[1],zmm3[1],zmm31[3],zmm3[3],zmm31[5],zmm3[5],zmm31[7],zmm3[7]
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm0, %zmm31
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm30, %zmm24 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa (%rcx), %xmm3
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa (%rdx), %xmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm2[0],xmm3[0]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa (%rsi), %xmm1
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm30 = xmm0[0],xmm1[0]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti32x4 $1, %xmm4, %ymm30, %ymm4
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm24, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm1, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm29 {%k1} = zmm23[0],zmm5[0],zmm23[2],zmm5[2],zmm23[4],zmm5[4],zmm23[6],zmm5[6]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm23[1],zmm5[1],zmm23[3],zmm5[3],zmm23[5],zmm5[5],zmm23[7],zmm5[7]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm26, %zmm4
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm22 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1]
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm22, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm29, %zmm7 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 64(%rcx), %xmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 64(%rdx), %xmm1
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 64(%rsi), %xmm4
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 64(%rdi), %xmm29
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm22 = xmm29[0],xmm4[0]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti32x4 $1, %xmm2, %ymm22, %ymm2
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm2, %zmm23
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm3, %zmm31, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm3, %zmm31, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm3, %zmm31, %zmm26
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm5 = zmm31[0],zmm3[0],zmm31[2],zmm3[2],zmm31[4],zmm3[4],zmm31[6],zmm3[6]
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm4 = zmm31[1],zmm3[1],zmm31[3],zmm3[3],zmm31[5],zmm3[5],zmm31[7],zmm3[7]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm2, %zmm31
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm25 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa (%rsi), %xmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti128 $1, (%rcx), %ymm2, %ymm3
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa (%rdi), %xmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti128 $1, (%rdx), %ymm2, %ymm2
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm29 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm29, %zmm25, %zmm25
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm25, %zmm17 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm17, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm6 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 64(%rsi), %xmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti128 $1, 64(%rcx), %ymm2, %ymm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 64(%rdi), %xmm17
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti32x4 $1, 64(%rdx), %ymm17, %ymm17
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm20 = ymm17[0],ymm2[0],ymm17[2],ymm2[2]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm20, %zmm6, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm30, %zmm7 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm17[1],ymm2[1],ymm17[3],ymm2[3]
 ; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm7, %zmm2
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm8 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm1 = xmm29[1],xmm4[1]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm8, %zmm30
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm19, %zmm9 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 128(%rcx), %xmm1
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 128(%rdx), %xmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm2[0],xmm1[0]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm14, %zmm8 {%k1}
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 128(%rsi), %xmm7
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 128(%rdi), %xmm8
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm19 = xmm8[0],xmm7[0]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti32x4 $1, %xmm4, %ymm19, %ymm4
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm9, %zmm29
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm25, %zmm15 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm1 = xmm2[1],xmm1[1]
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm8[1],xmm7[1]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm15, %zmm24
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti128 $1, 128(%rcx), %ymm7, %ymm7
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 128(%rdi), %xmm14
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti128 $1, 128(%rdx), %ymm14, %ymm14
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm17 = ymm14[0],ymm7[0],ymm14[2],ymm7[2]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm17, %zmm8, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm15 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm7 = ymm14[1],ymm7[1],ymm14[3],ymm7[3]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm7, %zmm15, %zmm30
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm10 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm14 # 32-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT:    # ymm14 = ymm2[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm14, %zmm10, %zmm29
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm12 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm14 # 32-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT:    # ymm14 = ymm2[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm14, %zmm12, %zmm25
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm14 # 32-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT:    # ymm14 = ymm2[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm14, %zmm24, %zmm24
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd $240, (%rsp), %ymm2, %ymm15 # 32-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT:    # ymm15 = ymm2[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm15, %zmm19, %zmm15
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm0 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 192(%rsi), %xmm11
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti128 $1, 192(%rcx), %ymm11, %ymm11
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 192(%rdi), %xmm17
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti32x4 $1, 192(%rdx), %ymm17, %ymm17
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm18 = ymm17[0],ymm11[0],ymm17[2],ymm11[2]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm18, %zmm0, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm1 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm11 = ymm17[1],ymm11[1],ymm17[3],ymm11[3]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm11, %zmm1, %zmm1
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm14 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm5 {%k1}
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
-; AVX512BW-ONLY-SLOW-NEXT:    # ymm2 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm14, %zmm22
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT:    # ymm11 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm11, %zmm5, %zmm5
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm17 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm4 {%k1}
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload
-; AVX512BW-ONLY-SLOW-NEXT:    # ymm7 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm7, %zmm17, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT:    # ymm11 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm11, %zmm4, %zmm4
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload
-; AVX512BW-ONLY-SLOW-NEXT:    # ymm8 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT:    # ymm11 = ymm0[0,1,2,3],mem[4,5,6,7]
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vinsertf64x4 $0, %ymm8, %zmm0, %zmm8
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload
-; AVX512BW-ONLY-SLOW-NEXT:    # ymm9 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm9, %zmm27, %zmm27
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm5 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 192(%rcx), %xmm13
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 192(%rdx), %xmm14
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm15 = xmm14[0],xmm13[0]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 192(%rsi), %xmm17
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 192(%rdi), %xmm19
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm25 = xmm19[0],xmm17[0]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti32x4 $1, %xmm15, %ymm25, %ymm15
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm15, %zmm5, %zmm25
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm6 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm13 = xmm14[1],xmm13[1]
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm14 = xmm19[1],xmm17[1]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm13, %ymm14, %ymm13
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm13, %zmm6, %zmm6
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm10 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload
-; AVX512BW-ONLY-SLOW-NEXT:    # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm13, %zmm10, %zmm10
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm11 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vinsertf64x4 $0, %ymm11, %zmm0, %zmm11
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-ONLY-SLOW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload
 ; AVX512BW-ONLY-SLOW-NEXT:    # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm13, %zmm11, %zmm11
-; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload
-; AVX512BW-ONLY-SLOW-NEXT:    # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vinsertf64x4 $0, %ymm13, %zmm0, %zmm13
-; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vblendps $240, (%rsp), %ymm0, %ymm14 # 32-byte Folded Reload
-; AVX512BW-ONLY-SLOW-NEXT:    # ymm14 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vinsertf64x4 $0, %ymm14, %zmm0, %zmm14
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm13, %zmm27, %zmm27
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm21 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa (%rcx), %ymm15
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%rdx), %ymm17
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm17[0],ymm15[0],ymm17[2],ymm15[2]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%rsi), %ymm18
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%rdi), %ymm19
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm19[0],ymm18[0],ymm19[2],ymm18[2]
-; AVX512BW-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm21, %zmm21
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%rcx), %ymm17
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%rdx), %ymm18
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm18[0],ymm17[0],ymm18[2],ymm17[2]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%rsi), %ymm19
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%rdi), %ymm20
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm6 = ymm20[0],ymm19[0],ymm20[2],ymm19[2]
+; AVX512BW-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm6[2,3],ymm3[2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm21, %zmm3
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm17[1],ymm15[1],ymm17[3],ymm15[3]
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm15 = ymm19[1],ymm18[1],ymm19[3],ymm18[3]
-; AVX512BW-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm15[2,3],ymm3[2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm1, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm8 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm6 = ymm18[1],ymm17[1],ymm18[3],ymm17[3]
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm20[1],ymm19[1],ymm20[3],ymm19[3]
+; AVX512BW-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm2[2,3],ymm6[2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm8, %zmm2
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm20 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 64(%rsi), %ymm15
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 64(%rdi), %ymm17
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 64(%rcx), %ymm18
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 64(%rdx), %ymm19
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm9 = ymm19[0],ymm18[0],ymm19[2],ymm18[2]
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm4 = ymm17[0],ymm15[0],ymm17[2],ymm15[2]
-; AVX512BW-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm4 = ymm4[2,3],ymm9[2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm20, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm22 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 64(%rcx), %ymm6
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 64(%rdx), %ymm17
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 64(%rsi), %ymm18
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 64(%rdi), %ymm19
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm14 = ymm17[0],ymm6[0],ymm17[2],ymm6[2]
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm8 = ymm19[0],ymm18[0],ymm19[2],ymm18[2]
+; AVX512BW-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm8 = ymm8[2,3],ymm14[2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm8, %zmm22, %zmm8
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm9 = ymm19[1],ymm18[1],ymm19[3],ymm18[3]
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm15 = ymm17[1],ymm15[1],ymm17[3],ymm15[3]
-; AVX512BW-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm9 = ymm15[2,3],ymm9[2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm9, %zmm1, %zmm9
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm10 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm6 = ymm17[1],ymm6[1],ymm17[3],ymm6[3]
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm14 = ymm19[1],ymm18[1],ymm19[3],ymm18[3]
+; AVX512BW-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm6 = ymm14[2,3],ymm6[2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm6, %zmm10, %zmm6
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm12 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 128(%rcx), %ymm15
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm9 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 128(%rcx), %ymm14
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 128(%rdx), %ymm17
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm5 = ymm17[0],ymm15[0],ymm17[2],ymm15[2]
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm17[0],ymm14[0],ymm17[2],ymm14[2]
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 128(%rsi), %ymm18
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 128(%rdi), %ymm19
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm19[0],ymm18[0],ymm19[2],ymm18[2]
-; AVX512BW-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm5[2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm12, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm10 = ymm19[0],ymm18[0],ymm19[2],ymm18[2]
+; AVX512BW-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm10[2,3],ymm0[2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm9, %zmm0
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm16 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm5 = ymm17[1],ymm15[1],ymm17[3],ymm15[3]
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm12 = ymm19[1],ymm18[1],ymm19[3],ymm18[3]
-; AVX512BW-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm5 = ymm12[2,3],ymm5[2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm5, %zmm16, %zmm5
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm26 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 192(%rcx), %ymm12
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 192(%rdx), %ymm15
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm15[0],ymm12[0],ymm15[2],ymm12[2]
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm9 = ymm17[1],ymm14[1],ymm17[3],ymm14[3]
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm10 = ymm19[1],ymm18[1],ymm19[3],ymm18[3]
+; AVX512BW-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm9 = ymm10[2,3],ymm9[2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm9, %zmm16, %zmm9
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm26 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 192(%rcx), %ymm10
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 192(%rdx), %ymm14
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm13 = ymm14[0],ymm10[0],ymm14[2],ymm10[2]
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 192(%rsi), %ymm16
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 192(%rdi), %ymm17
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm2 = ymm17[0],ymm16[0],ymm17[2],ymm16[2]
-; AVX512BW-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm26, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm12 = ymm17[0],ymm16[0],ymm17[2],ymm16[2]
+; AVX512BW-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm12 = ymm12[2,3],ymm13[2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm12, %zmm26, %zmm12
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm31 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm15[1],ymm12[1],ymm15[3],ymm12[3]
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm12 = ymm17[1],ymm16[1],ymm17[3],ymm16[3]
-; AVX512BW-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm12[2,3],ymm2[2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm31, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm10 = ymm14[1],ymm10[1],ymm14[3],ymm10[3]
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm13 = ymm17[1],ymm16[1],ymm17[3],ymm16[3]
+; AVX512BW-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm10 = ymm13[2,3],ymm10[2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm10, %zmm31, %zmm10
 ; AVX512BW-ONLY-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, 1728(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, 1664(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, 1216(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, 1152(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, 704(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, 640(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, 192(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm21, 128(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm14, 1984(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm13, 1920(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm11, 1856(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm10, 1792(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, 1600(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm25, 1536(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm27, 1472(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm8, 1408(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, 1344(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm22, 1280(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm24, 1088(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm29, 1024(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm10, 1728(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm12, 1664(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, 1216(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, 1152(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, 704(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, 640(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, 192(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, 128(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm27, 1984(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm11, 1920(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, 1856(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, 1792(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, 1600(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, 1536(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm15, 1472(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm24, 1408(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm25, 1344(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm29, 1280(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm30, 1088(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 1024(%rax)
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 960(%rax)
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
@@ -7944,7 +6557,8 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 832(%rax)
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 768(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm30, 576(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 576(%rax)
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 512(%rax)
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
@@ -7959,13 +6573,13 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 64(%rax)
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, (%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    addq $2632, %rsp # imm = 0xA48
+; AVX512BW-ONLY-SLOW-NEXT:    addq $2568, %rsp # imm = 0xA08
 ; AVX512BW-ONLY-SLOW-NEXT:    vzeroupper
 ; AVX512BW-ONLY-SLOW-NEXT:    retq
 ;
 ; AVX512BW-ONLY-FAST-LABEL: store_i64_stride8_vf32:
 ; AVX512BW-ONLY-FAST:       # %bb.0:
-; AVX512BW-ONLY-FAST-NEXT:    subq $2632, %rsp # imm = 0xA48
+; AVX512BW-ONLY-FAST-NEXT:    subq $2568, %rsp # imm = 0xA08
 ; AVX512BW-ONLY-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512BW-ONLY-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %r10
 ; AVX512BW-ONLY-FAST-NEXT:    vmovaps 128(%rdi), %zmm0
@@ -7973,75 +6587,75 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%rdi), %zmm6
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%rdi), %zmm0
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 128(%rsi), %zmm23
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%rsi), %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%rsi), %zmm16
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%rsi), %zmm3
-; AVX512BW-ONLY-FAST-NEXT:    vmovaps 192(%rdx), %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vmovups %zmm2, (%rsp) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovaps 128(%rdx), %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps 192(%rdx), %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps 128(%rdx), %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovups %zmm1, (%rsp) # 64-byte Spill
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%rdx), %zmm8
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%rdx), %zmm2
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%rcx), %zmm5
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%rcx), %zmm4
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%r8), %zmm22
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%r8), %zmm20
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%r8), %zmm19
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%r9), %zmm29
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%r9), %zmm25
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%r10), %zmm18
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%r10), %zmm17
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%r9), %zmm30
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%r9), %zmm15
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%r10), %zmm17
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%r10), %zmm18
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%rax), %zmm21
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%rax), %zmm20
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%rax), %zmm22
 ; AVX512BW-ONLY-FAST-NEXT:    movb $-64, %r11b
 ; AVX512BW-ONLY-FAST-NEXT:    kmovd %r11d, %k1
-; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm12 = [4,12,4,12,4,12,4,12]
-; AVX512BW-ONLY-FAST-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm18, %zmm7
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm21, %zmm12, %zmm7
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm9 = zmm22[0],zmm29[0],zmm22[2],zmm29[2],zmm22[4],zmm29[4],zmm22[6],zmm29[6]
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm13 = [4,12,4,12,4,12,4,12]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm17, %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm21, %zmm13, %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm9 = zmm20[0],zmm30[0],zmm20[2],zmm30[2],zmm20[4],zmm30[4],zmm20[6],zmm30[6]
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, %zmm9 {%k1}
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm7
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm12, %zmm7
-; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm11 = [4,12,4,12]
-; AVX512BW-ONLY-FAST-NEXT:    # ymm11 = mem[0,1,0,1]
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm13, %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [4,12,4,12]
+; AVX512BW-ONLY-FAST-NEXT:    # ymm1 = mem[0,1,0,1]
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm10
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm11, %zmm10
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm1, %zmm10
 ; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7]
 ; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm7, %zmm9, %zmm7
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm14 = [5,13,5,13,5,13,5,13]
 ; AVX512BW-ONLY-FAST-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm18, %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm17, %zmm7
 ; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm21, %zmm14, %zmm7
-; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm9 = zmm22[1],zmm29[1],zmm22[3],zmm29[3],zmm22[5],zmm29[5],zmm22[7],zmm29[7]
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm9 = zmm20[1],zmm30[1],zmm20[3],zmm30[3],zmm20[5],zmm30[5],zmm20[7],zmm30[7]
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, %zmm9 {%k1}
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm7
 ; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm14, %zmm7
-; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm13 = [5,13,5,13]
-; AVX512BW-ONLY-FAST-NEXT:    # ymm13 = mem[0,1,0,1]
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm11 = [5,13,5,13]
+; AVX512BW-ONLY-FAST-NEXT:    # ymm11 = mem[0,1,0,1]
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm10
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm13, %zmm10
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm11, %zmm10
 ; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7]
 ; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm7, %zmm9, %zmm7
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm15 = [6,14,6,14,6,14,6,14]
-; AVX512BW-ONLY-FAST-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm22, %zmm7
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm29, %zmm15, %zmm7
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm18[0],zmm21[0],zmm18[2],zmm21[2],zmm18[4],zmm21[4],zmm18[6],zmm21[6]
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm12 = [6,14,6,14,6,14,6,14]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm20, %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm30, %zmm12, %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm17[0],zmm21[0],zmm17[2],zmm21[2],zmm17[4],zmm21[4],zmm17[6],zmm21[6]
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm9
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm15, %zmm9
-; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm11 = [6,14,6,14]
-; AVX512BW-ONLY-FAST-NEXT:    # ymm11 = mem[0,1,0,1]
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm12, %zmm9
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14]
+; AVX512BW-ONLY-FAST-NEXT:    # ymm1 = mem[0,1,0,1]
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm10
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm11, %zmm10
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm1, %zmm10
 ; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7]
 ; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm9, %zmm7, %zmm7
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm26 = [7,15,7,15,7,15,7,15]
 ; AVX512BW-ONLY-FAST-NEXT:    # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm22, %zmm7
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm29, %zmm26, %zmm7
-; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm7 {%k1} = zmm18[1],zmm21[1],zmm18[3],zmm21[3],zmm18[5],zmm21[5],zmm18[7],zmm21[7]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm20, %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm30, %zmm26, %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm7 {%k1} = zmm17[1],zmm21[1],zmm17[3],zmm21[3],zmm17[5],zmm21[5],zmm17[7],zmm21[7]
 ; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm26, %zmm0
 ; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} ymm28 = [7,15,7,15]
 ; AVX512BW-ONLY-FAST-NEXT:    # ymm28 = mem[0,1,2,3,0,1,2,3]
@@ -8049,12 +6663,12 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
 ; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm7, %zmm0
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm17, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm20, %zmm12, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm2 = zmm19[0],zmm25[0],zmm19[2],zmm25[2],zmm19[4],zmm25[4],zmm19[6],zmm25[6]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm18, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm22, %zmm13, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm2 = zmm19[0],zmm15[0],zmm19[2],zmm15[2],zmm19[4],zmm15[4],zmm19[6],zmm15[6]
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k1}
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm6, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm12, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm16, %zmm13, %zmm0
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, %zmm3
 ; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [4,12,4,12]
 ; AVX512BW-ONLY-FAST-NEXT:    # ymm4 = mem[0,1,0,1]
@@ -8062,383 +6676,366 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
 ; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm0
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm17, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm20, %zmm14, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm2 = zmm19[1],zmm25[1],zmm19[3],zmm25[3],zmm19[5],zmm25[5],zmm19[7],zmm25[7]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm18, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm22, %zmm14, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm2 = zmm19[1],zmm15[1],zmm19[3],zmm15[3],zmm19[5],zmm15[5],zmm19[7],zmm15[7]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm19, %zmm9
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k1}
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm6, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm14, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm16, %zmm14, %zmm0
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, %zmm3
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm7
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm13, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm11, %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm11, %zmm3
 ; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
 ; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm0
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm6, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm15, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm16, %zmm12, %zmm0
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm11, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm1, %zmm2
 ; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm19, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm25, %zmm15, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm17[0],zmm20[0],zmm17[2],zmm20[2],zmm17[4],zmm20[4],zmm17[6],zmm20[6]
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm15, %zmm12, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm18[0],zmm22[0],zmm18[2],zmm22[2],zmm18[4],zmm22[4],zmm18[6],zmm22[6]
 ; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm0
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 128(%rcx), %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm26, %zmm6
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm16, %zmm26, %zmm6
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm19, %zmm1
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm25, %zmm26, %zmm1
-; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm17[1],zmm20[1],zmm17[3],zmm20[3],zmm17[5],zmm20[5],zmm17[7],zmm20[7]
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm15, %zmm26, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm18[1],zmm22[1],zmm18[3],zmm22[3],zmm18[5],zmm22[5],zmm18[7],zmm22[7]
 ; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm28, %zmm8
 ; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm8[4,5,6,7]
 ; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm1, %zmm1
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 (%rsp), %zmm11 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm11, %zmm1
 ; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm4, %zmm1
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm11, %zmm1
 ; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm7, %zmm1
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm1
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm11, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm11, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm3, %zmm1
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm28, %zmm13
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm28, %zmm11
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm11, (%rsp) # 64-byte Spill
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 192(%rcx), %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 (%rsp), %zmm13 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm0, %zmm13, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm0, %zmm11, %zmm4
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm0, %zmm13, %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm0, %zmm11, %zmm7
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm0, %zmm13, %zmm11
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm0, %zmm11, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm28, %zmm11
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm28, %zmm13
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm13, (%rsp) # 64-byte Spill
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm23, %zmm12, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm23, %zmm13, %zmm1
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1
 ; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm23, %zmm14, %zmm1
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm23, %zmm15, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm23, %zmm12, %zmm1
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm23, %zmm26, %zmm0
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 128(%r10), %zmm28
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 128(%rax), %zmm4
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm28, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm12, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm13, %zmm0
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm28, %zmm0
 ; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm14, %zmm0
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 128(%r8), %zmm16
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 128(%r9), %zmm13
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm16, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm13, %zmm15, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm9
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 128(%r9), %zmm11
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm16, %zmm24
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm11, %zmm12, %zmm24
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 192(%r10), %zmm23
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 192(%rax), %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 192(%rax), %zmm5
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm23, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm12, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm13, %zmm0
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 192(%rdi), %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 192(%rdi), %zmm1
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 192(%rsi), %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm0, %zmm2, %zmm12
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm23, %zmm3
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm14, %zmm3
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm0, %zmm2, %zmm14
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm0, %zmm1, %zmm13
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm23, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm14, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm0, %zmm1, %zmm14
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 192(%r8), %zmm31
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 192(%r9), %zmm3
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm31, %zmm5
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm15, %zmm5
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, %zmm10
-; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm0, %zmm2, %zmm15
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm26, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm31, %zmm29
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm12, %zmm29
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm0, %zmm1, %zmm12
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm26, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm16, %zmm19
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm11, %zmm26, %zmm19
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm31, %zmm27
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm26, %zmm27
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,0,8,0,8,0,8]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm17, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm21, %zmm0, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [1,9,1,9,1,9,1,9]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm17, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm21, %zmm1, %zmm2
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm16, %zmm27
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm13, %zmm26, %zmm27
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm31, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm26, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm5 = [0,8,0,8,0,8,0,8]
-; AVX512BW-ONLY-FAST-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm18, %zmm30
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm21, %zmm5, %zmm30
-; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm6 = [1,9,1,9,1,9,1,9]
-; AVX512BW-ONLY-FAST-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm18, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm21, %zmm6, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm26 = [2,10,2,10,2,10,2,10]
 ; AVX512BW-ONLY-FAST-NEXT:    # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm18, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm21, %zmm26, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11]
-; AVX512BW-ONLY-FAST-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm21, %zmm0, %zmm18
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm22, %zmm7
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm22, %zmm24
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm29, %zmm5, %zmm24
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm29, %zmm6, %zmm22
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, %zmm21
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm29, %zmm26, %zmm21
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm29, %zmm0, %zmm7
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm17, %zmm29
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm20, %zmm5, %zmm29
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm17, %zmm7
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm20, %zmm6, %zmm7
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm17, %zmm7
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm20, %zmm26, %zmm7
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm20, %zmm0, %zmm17
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm17, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm21, %zmm26, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,3,11,3,11,3,11]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm21, %zmm2, %zmm17
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm19, %zmm7
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm25, %zmm5, %zmm7
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm19, %zmm8
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm25, %zmm6, %zmm8
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm19, %zmm20
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm25, %zmm26, %zmm20
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm25, %zmm0, %zmm19
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm28, %zmm19
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm5, %zmm19
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm28, %zmm25
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm6, %zmm25
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm28[0],zmm4[0],zmm28[2],zmm4[2],zmm28[4],zmm4[4],zmm28[6],zmm4[6]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm28[1],zmm4[1],zmm28[3],zmm4[3],zmm28[5],zmm4[5],zmm28[7],zmm4[7]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm28, %zmm9
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm26, %zmm9
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm0, %zmm28
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm16, %zmm9
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm13, %zmm5, %zmm9
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm20, %zmm25
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm30, %zmm0, %zmm25
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm20, %zmm17
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm30, %zmm1, %zmm17
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm20, %zmm21
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm30, %zmm26, %zmm21
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm30, %zmm2, %zmm20
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm18, %zmm20
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm22, %zmm0, %zmm20
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm18, %zmm30
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm22, %zmm1, %zmm30
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm18, %zmm6
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm22, %zmm26, %zmm6
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm22, %zmm2, %zmm18
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, %zmm8
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, %zmm6
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm15, %zmm0, %zmm6
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm15, %zmm1, %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, %zmm22
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm15, %zmm26, %zmm22
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm15, %zmm2, %zmm8
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm28, %zmm14
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm0, %zmm14
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm28, %zmm18
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm1, %zmm18
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm24 {%k1} = zmm28[0],zmm4[0],zmm28[2],zmm4[2],zmm28[4],zmm4[4],zmm28[6],zmm4[6]
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm19 {%k1} = zmm28[1],zmm4[1],zmm28[3],zmm4[3],zmm28[5],zmm4[5],zmm28[7],zmm4[7]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm28, %zmm8
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm26, %zmm8
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm2, %zmm28
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm16, %zmm8
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm11, %zmm0, %zmm8
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm16, %zmm15
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm13, %zmm6, %zmm15
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm16, %zmm12
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm13, %zmm26, %zmm12
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm14 = zmm16[0],zmm13[0],zmm16[2],zmm13[2],zmm16[4],zmm13[4],zmm16[6],zmm13[6]
-; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm17 = zmm16[1],zmm13[1],zmm16[3],zmm13[3],zmm16[5],zmm13[5],zmm16[7],zmm13[7]
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm13, %zmm0, %zmm16
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm11, %zmm1, %zmm15
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm16, %zmm9
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm11, %zmm26, %zmm9
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm10 = zmm16[0],zmm11[0],zmm16[2],zmm11[2],zmm16[4],zmm11[4],zmm16[6],zmm11[6]
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm12 = zmm16[1],zmm11[1],zmm16[3],zmm11[3],zmm16[5],zmm11[5],zmm16[7],zmm11[7]
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm11, %zmm2, %zmm16
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm23, %zmm11
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm0, %zmm11
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm23, %zmm13
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm5, %zmm13
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm23, %zmm18
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm6, %zmm18
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm10 {%k1} = zmm23[0],zmm1[0],zmm23[2],zmm1[2],zmm23[4],zmm1[4],zmm23[6],zmm1[6]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm23[1],zmm1[1],zmm23[3],zmm1[3],zmm23[5],zmm1[5],zmm23[7],zmm1[7]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm23, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm26, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm0, %zmm23
-; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm3, %zmm31, %zmm5
-; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm3, %zmm31, %zmm6
-; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm3, %zmm31, %zmm26
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm10 = zmm31[0],zmm3[0],zmm31[2],zmm3[2],zmm31[4],zmm3[4],zmm31[6],zmm3[6]
-; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm11 = zmm31[1],zmm3[1],zmm31[3],zmm3[3],zmm31[5],zmm3[5],zmm31[7],zmm3[7]
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm0, %zmm31
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm30, %zmm24 {%k1}
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa (%rcx), %xmm3
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa (%rdx), %xmm2
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm2[0],xmm3[0]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa (%rsi), %xmm1
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm30 = xmm0[0],xmm1[0]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti32x4 $1, %xmm4, %ymm30, %ymm4
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm24, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm1, %zmm13
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm29 {%k1} = zmm23[0],zmm5[0],zmm23[2],zmm5[2],zmm23[4],zmm5[4],zmm23[6],zmm5[6]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm23[1],zmm5[1],zmm23[3],zmm5[3],zmm23[5],zmm5[5],zmm23[7],zmm5[7]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm23, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm26, %zmm4
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm22 {%k1}
-; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1]
-; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm22, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm29, %zmm7 {%k1}
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 64(%rcx), %xmm0
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 64(%rdx), %xmm1
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 64(%rsi), %xmm4
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%rdi), %xmm29
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm22 = xmm29[0],xmm4[0]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti32x4 $1, %xmm2, %ymm22, %ymm2
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm2, %zmm23
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm3, %zmm31, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm3, %zmm31, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm3, %zmm31, %zmm26
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm5 = zmm31[0],zmm3[0],zmm31[2],zmm3[2],zmm31[4],zmm3[4],zmm31[6],zmm3[6]
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm4 = zmm31[1],zmm3[1],zmm31[3],zmm3[3],zmm31[5],zmm3[5],zmm31[7],zmm3[7]
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm2, %zmm31
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm25 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa (%rsi), %xmm2
+; AVX512BW-ONLY-FAST-NEXT:    vinserti128 $1, (%rcx), %ymm2, %ymm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa (%rdi), %xmm2
+; AVX512BW-ONLY-FAST-NEXT:    vinserti128 $1, (%rdx), %ymm2, %ymm2
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm29 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm29, %zmm25, %zmm25
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm25, %zmm17 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm17, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm20, %zmm6 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 64(%rsi), %xmm2
+; AVX512BW-ONLY-FAST-NEXT:    vinserti128 $1, 64(%rcx), %ymm2, %ymm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%rdi), %xmm17
+; AVX512BW-ONLY-FAST-NEXT:    vinserti32x4 $1, 64(%rdx), %ymm17, %ymm17
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm20 = ymm17[0],ymm2[0],ymm17[2],ymm2[2]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm20, %zmm6, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm30, %zmm7 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm17[1],ymm2[1],ymm17[3],ymm2[3]
 ; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm7, %zmm2
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm8 {%k1}
-; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm1 = xmm29[1],xmm4[1]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm8, %zmm30
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm19, %zmm9 {%k1}
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 128(%rcx), %xmm1
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 128(%rdx), %xmm2
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm2[0],xmm1[0]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm14, %zmm8 {%k1}
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 128(%rsi), %xmm7
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 128(%rdi), %xmm8
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm19 = xmm8[0],xmm7[0]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti32x4 $1, %xmm4, %ymm19, %ymm4
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm9, %zmm29
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm25, %zmm15 {%k1}
-; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm1 = xmm2[1],xmm1[1]
-; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm8[1],xmm7[1]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm15, %zmm24
+; AVX512BW-ONLY-FAST-NEXT:    vinserti128 $1, 128(%rcx), %ymm7, %ymm7
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 128(%rdi), %xmm14
+; AVX512BW-ONLY-FAST-NEXT:    vinserti128 $1, 128(%rdx), %ymm14, %ymm14
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm17 = ymm14[0],ymm7[0],ymm14[2],ymm7[2]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm17, %zmm8, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm18, %zmm15 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm7 = ymm14[1],ymm7[1],ymm14[3],ymm7[3]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm7, %zmm15, %zmm30
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm10 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm14 # 32-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT:    # ymm14 = ymm2[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm14, %zmm10, %zmm29
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm12 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm14 # 32-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT:    # ymm14 = ymm2[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm14, %zmm12, %zmm25
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm14 # 32-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT:    # ymm14 = ymm2[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm14, %zmm24, %zmm24
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd $240, (%rsp), %ymm2, %ymm15 # 32-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT:    # ymm15 = ymm2[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm15, %zmm19, %zmm15
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm11, %zmm0 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 192(%rsi), %xmm11
+; AVX512BW-ONLY-FAST-NEXT:    vinserti128 $1, 192(%rcx), %ymm11, %ymm11
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 192(%rdi), %xmm17
+; AVX512BW-ONLY-FAST-NEXT:    vinserti32x4 $1, 192(%rdx), %ymm17, %ymm17
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm18 = ymm17[0],ymm11[0],ymm17[2],ymm11[2]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm18, %zmm0, %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm1 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm11 = ymm17[1],ymm11[1],ymm17[3],ymm11[3]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm11, %zmm1, %zmm1
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm14 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm5 {%k1}
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
-; AVX512BW-ONLY-FAST-NEXT:    # ymm2 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm14, %zmm22
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT:    # ymm11 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm11, %zmm5, %zmm5
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm17 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm4 {%k1}
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload
-; AVX512BW-ONLY-FAST-NEXT:    # ymm7 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm7, %zmm17, %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT:    # ymm11 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm11, %zmm4, %zmm4
 ; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload
-; AVX512BW-ONLY-FAST-NEXT:    # ymm8 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT:    # ymm11 = ymm0[0,1,2,3],mem[4,5,6,7]
 ; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vinsertf64x4 $0, %ymm8, %zmm0, %zmm8
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload
-; AVX512BW-ONLY-FAST-NEXT:    # ymm9 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm9, %zmm27, %zmm27
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm5 {%k1}
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 192(%rcx), %xmm13
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 192(%rdx), %xmm14
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm15 = xmm14[0],xmm13[0]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 192(%rsi), %xmm17
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 192(%rdi), %xmm19
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm25 = xmm19[0],xmm17[0]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti32x4 $1, %xmm15, %ymm25, %ymm15
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm15, %zmm5, %zmm25
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm18, %zmm6 {%k1}
-; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm13 = xmm14[1],xmm13[1]
-; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm14 = xmm19[1],xmm17[1]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti128 $1, %xmm13, %ymm14, %ymm13
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm13, %zmm6, %zmm6
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm10 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vinsertf64x4 $0, %ymm11, %zmm0, %zmm11
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-ONLY-FAST-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload
 ; AVX512BW-ONLY-FAST-NEXT:    # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm13, %zmm10, %zmm10
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm11 {%k1}
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload
-; AVX512BW-ONLY-FAST-NEXT:    # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm13, %zmm11, %zmm11
-; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload
-; AVX512BW-ONLY-FAST-NEXT:    # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vinsertf64x4 $0, %ymm13, %zmm0, %zmm13
-; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vblendps $240, (%rsp), %ymm0, %ymm14 # 32-byte Folded Reload
-; AVX512BW-ONLY-FAST-NEXT:    # ymm14 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vinsertf64x4 $0, %ymm14, %zmm0, %zmm14
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm13, %zmm27, %zmm27
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm21 {%k1}
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa (%rcx), %ymm15
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%rdx), %ymm17
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm17[0],ymm15[0],ymm17[2],ymm15[2]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%rsi), %ymm18
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%rdi), %ymm19
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm19[0],ymm18[0],ymm19[2],ymm18[2]
-; AVX512BW-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm21, %zmm21
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%rcx), %ymm17
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%rdx), %ymm18
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm18[0],ymm17[0],ymm18[2],ymm17[2]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%rsi), %ymm19
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%rdi), %ymm20
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm6 = ymm20[0],ymm19[0],ymm20[2],ymm19[2]
+; AVX512BW-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm6[2,3],ymm3[2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm21, %zmm3
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm17[1],ymm15[1],ymm17[3],ymm15[3]
-; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm15 = ymm19[1],ymm18[1],ymm19[3],ymm18[3]
-; AVX512BW-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm15[2,3],ymm3[2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm1, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm8 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm6 = ymm18[1],ymm17[1],ymm18[3],ymm17[3]
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm20[1],ymm19[1],ymm20[3],ymm19[3]
+; AVX512BW-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm2[2,3],ymm6[2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm8, %zmm2
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm20 {%k1}
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 64(%rsi), %ymm15
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%rdi), %ymm17
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%rcx), %ymm18
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%rdx), %ymm19
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm9 = ymm19[0],ymm18[0],ymm19[2],ymm18[2]
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm4 = ymm17[0],ymm15[0],ymm17[2],ymm15[2]
-; AVX512BW-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm4 = ymm4[2,3],ymm9[2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm20, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm22 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 64(%rcx), %ymm6
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%rdx), %ymm17
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%rsi), %ymm18
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%rdi), %ymm19
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm14 = ymm17[0],ymm6[0],ymm17[2],ymm6[2]
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm8 = ymm19[0],ymm18[0],ymm19[2],ymm18[2]
+; AVX512BW-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm8 = ymm8[2,3],ymm14[2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm8, %zmm22, %zmm8
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm9 = ymm19[1],ymm18[1],ymm19[3],ymm18[3]
-; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm15 = ymm17[1],ymm15[1],ymm17[3],ymm15[3]
-; AVX512BW-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm9 = ymm15[2,3],ymm9[2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm9, %zmm1, %zmm9
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm10 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm6 = ymm17[1],ymm6[1],ymm17[3],ymm6[3]
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm14 = ymm19[1],ymm18[1],ymm19[3],ymm18[3]
+; AVX512BW-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm6 = ymm14[2,3],ymm6[2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm6, %zmm10, %zmm6
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm12 {%k1}
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 128(%rcx), %ymm15
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm9 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 128(%rcx), %ymm14
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 128(%rdx), %ymm17
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm5 = ymm17[0],ymm15[0],ymm17[2],ymm15[2]
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm17[0],ymm14[0],ymm17[2],ymm14[2]
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 128(%rsi), %ymm18
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 128(%rdi), %ymm19
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm19[0],ymm18[0],ymm19[2],ymm18[2]
-; AVX512BW-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm5[2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm12, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm10 = ymm19[0],ymm18[0],ymm19[2],ymm18[2]
+; AVX512BW-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm10[2,3],ymm0[2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm9, %zmm0
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm28, %zmm16 {%k1}
-; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm5 = ymm17[1],ymm15[1],ymm17[3],ymm15[3]
-; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm12 = ymm19[1],ymm18[1],ymm19[3],ymm18[3]
-; AVX512BW-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm5 = ymm12[2,3],ymm5[2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm5, %zmm16, %zmm5
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm26 {%k1}
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 192(%rcx), %ymm12
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 192(%rdx), %ymm15
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm15[0],ymm12[0],ymm15[2],ymm12[2]
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm9 = ymm17[1],ymm14[1],ymm17[3],ymm14[3]
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm10 = ymm19[1],ymm18[1],ymm19[3],ymm18[3]
+; AVX512BW-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm9 = ymm10[2,3],ymm9[2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm9, %zmm16, %zmm9
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm10, %zmm26 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 192(%rcx), %ymm10
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 192(%rdx), %ymm14
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm13 = ymm14[0],ymm10[0],ymm14[2],ymm10[2]
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 192(%rsi), %ymm16
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 192(%rdi), %ymm17
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm2 = ymm17[0],ymm16[0],ymm17[2],ymm16[2]
-; AVX512BW-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm26, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm12 = ymm17[0],ymm16[0],ymm17[2],ymm16[2]
+; AVX512BW-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm12 = ymm12[2,3],ymm13[2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm12, %zmm26, %zmm12
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm23, %zmm31 {%k1}
-; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm15[1],ymm12[1],ymm15[3],ymm12[3]
-; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm12 = ymm17[1],ymm16[1],ymm17[3],ymm16[3]
-; AVX512BW-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm12[2,3],ymm2[2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm31, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm10 = ymm14[1],ymm10[1],ymm14[3],ymm10[3]
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm13 = ymm17[1],ymm16[1],ymm17[3],ymm16[3]
+; AVX512BW-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm10 = ymm13[2,3],ymm10[2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm10, %zmm31, %zmm10
 ; AVX512BW-ONLY-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, 1728(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, 1664(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, 1216(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, 1152(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, 704(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, 640(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, 192(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm21, 128(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm14, 1984(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm13, 1920(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm11, 1856(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm10, 1792(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm6, 1600(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm25, 1536(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm27, 1472(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm8, 1408(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, 1344(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm22, 1280(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm24, 1088(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm29, 1024(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm10, 1728(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm12, 1664(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, 1216(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, 1152(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm6, 704(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, 640(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, 192(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, 128(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm27, 1984(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm11, 1920(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, 1856(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, 1792(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, 1600(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, 1536(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm15, 1472(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm24, 1408(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm25, 1344(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm29, 1280(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm30, 1088(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 1024(%rax)
 ; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 960(%rax)
 ; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
@@ -8447,7 +7044,8 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 832(%rax)
 ; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 768(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm30, 576(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 576(%rax)
 ; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 512(%rax)
 ; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
@@ -8462,13 +7060,13 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 64(%rax)
 ; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, (%rax)
-; AVX512BW-ONLY-FAST-NEXT:    addq $2632, %rsp # imm = 0xA48
+; AVX512BW-ONLY-FAST-NEXT:    addq $2568, %rsp # imm = 0xA08
 ; AVX512BW-ONLY-FAST-NEXT:    vzeroupper
 ; AVX512BW-ONLY-FAST-NEXT:    retq
 ;
 ; AVX512DQBW-SLOW-LABEL: store_i64_stride8_vf32:
 ; AVX512DQBW-SLOW:       # %bb.0:
-; AVX512DQBW-SLOW-NEXT:    subq $2632, %rsp # imm = 0xA48
+; AVX512DQBW-SLOW-NEXT:    subq $2568, %rsp # imm = 0xA08
 ; AVX512DQBW-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512DQBW-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %r10
 ; AVX512DQBW-SLOW-NEXT:    vmovaps 128(%rdi), %zmm0
@@ -8476,75 +7074,75 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 64(%rdi), %zmm6
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm0
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 128(%rsi), %zmm23
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 64(%rsi), %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 64(%rsi), %zmm16
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%rsi), %zmm3
-; AVX512DQBW-SLOW-NEXT:    vmovaps 192(%rdx), %zmm2
-; AVX512DQBW-SLOW-NEXT:    vmovups %zmm2, (%rsp) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovaps 128(%rdx), %zmm2
-; AVX512DQBW-SLOW-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovaps 192(%rdx), %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovaps 128(%rdx), %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovups %zmm1, (%rsp) # 64-byte Spill
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 64(%rdx), %zmm8
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%rdx), %zmm2
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 64(%rcx), %zmm5
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%rcx), %zmm4
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%r8), %zmm22
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%r8), %zmm20
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 64(%r8), %zmm19
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%r9), %zmm29
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 64(%r9), %zmm25
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%r10), %zmm18
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 64(%r10), %zmm17
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%r9), %zmm30
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 64(%r9), %zmm15
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%r10), %zmm17
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 64(%r10), %zmm18
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%rax), %zmm21
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 64(%rax), %zmm20
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 64(%rax), %zmm22
 ; AVX512DQBW-SLOW-NEXT:    movb $-64, %r11b
 ; AVX512DQBW-SLOW-NEXT:    kmovd %r11d, %k1
-; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm12 = [4,12,4,12,4,12,4,12]
-; AVX512DQBW-SLOW-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm7
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm21, %zmm12, %zmm7
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm9 = zmm22[0],zmm29[0],zmm22[2],zmm29[2],zmm22[4],zmm29[4],zmm22[6],zmm29[6]
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm13 = [4,12,4,12,4,12,4,12]
+; AVX512DQBW-SLOW-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm7
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm21, %zmm13, %zmm7
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm9 = zmm20[0],zmm30[0],zmm20[2],zmm30[2],zmm20[4],zmm30[4],zmm20[6],zmm30[6]
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm9 {%k1}
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm7
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm3, %zmm12, %zmm7
-; AVX512DQBW-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm11 = [4,12,4,12]
-; AVX512DQBW-SLOW-NEXT:    # ymm11 = mem[0,1,0,1]
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm3, %zmm13, %zmm7
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [4,12,4,12]
+; AVX512DQBW-SLOW-NEXT:    # ymm1 = mem[0,1,0,1]
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm10
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm4, %zmm11, %zmm10
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm4, %zmm1, %zmm10
 ; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7]
 ; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm7, %zmm9, %zmm7
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm14 = [5,13,5,13,5,13,5,13]
 ; AVX512DQBW-SLOW-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm7
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm7
 ; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm21, %zmm14, %zmm7
-; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm9 = zmm22[1],zmm29[1],zmm22[3],zmm29[3],zmm22[5],zmm29[5],zmm22[7],zmm29[7]
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm9 = zmm20[1],zmm30[1],zmm20[3],zmm30[3],zmm20[5],zmm30[5],zmm20[7],zmm30[7]
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm9 {%k1}
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm7
 ; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm3, %zmm14, %zmm7
-; AVX512DQBW-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm13 = [5,13,5,13]
-; AVX512DQBW-SLOW-NEXT:    # ymm13 = mem[0,1,0,1]
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm11 = [5,13,5,13]
+; AVX512DQBW-SLOW-NEXT:    # ymm11 = mem[0,1,0,1]
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm10
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm4, %zmm13, %zmm10
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm4, %zmm11, %zmm10
 ; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7]
 ; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm7, %zmm9, %zmm7
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm15 = [6,14,6,14,6,14,6,14]
-; AVX512DQBW-SLOW-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm22, %zmm7
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm29, %zmm15, %zmm7
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm18[0],zmm21[0],zmm18[2],zmm21[2],zmm18[4],zmm21[4],zmm18[6],zmm21[6]
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm12 = [6,14,6,14,6,14,6,14]
+; AVX512DQBW-SLOW-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm7
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm30, %zmm12, %zmm7
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm17[0],zmm21[0],zmm17[2],zmm21[2],zmm17[4],zmm21[4],zmm17[6],zmm21[6]
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm9
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm3, %zmm15, %zmm9
-; AVX512DQBW-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm11 = [6,14,6,14]
-; AVX512DQBW-SLOW-NEXT:    # ymm11 = mem[0,1,0,1]
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm3, %zmm12, %zmm9
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14]
+; AVX512DQBW-SLOW-NEXT:    # ymm1 = mem[0,1,0,1]
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm10
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm4, %zmm11, %zmm10
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm4, %zmm1, %zmm10
 ; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7]
 ; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm9, %zmm7, %zmm7
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm26 = [7,15,7,15,7,15,7,15]
 ; AVX512DQBW-SLOW-NEXT:    # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm22, %zmm7
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm29, %zmm26, %zmm7
-; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm7 {%k1} = zmm18[1],zmm21[1],zmm18[3],zmm21[3],zmm18[5],zmm21[5],zmm18[7],zmm21[7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm7
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm30, %zmm26, %zmm7
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm7 {%k1} = zmm17[1],zmm21[1],zmm17[3],zmm21[3],zmm17[5],zmm21[5],zmm17[7],zmm21[7]
 ; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm3, %zmm26, %zmm0
 ; AVX512DQBW-SLOW-NEXT:    vbroadcasti64x2 {{.*#+}} ymm28 = [7,15,7,15]
 ; AVX512DQBW-SLOW-NEXT:    # ymm28 = mem[0,1,0,1]
@@ -8552,12 +7150,12 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
 ; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm7, %zmm0
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm20, %zmm12, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm2 = zmm19[0],zmm25[0],zmm19[2],zmm25[2],zmm19[4],zmm25[4],zmm19[6],zmm25[6]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm22, %zmm13, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm2 = zmm19[0],zmm15[0],zmm19[2],zmm15[2],zmm19[4],zmm15[4],zmm19[6],zmm15[6]
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k1}
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm1, %zmm12, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm16, %zmm13, %zmm0
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm3
 ; AVX512DQBW-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [4,12,4,12]
 ; AVX512DQBW-SLOW-NEXT:    # ymm4 = mem[0,1,0,1]
@@ -8565,383 +7163,366 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
 ; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm0
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm20, %zmm14, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm2 = zmm19[1],zmm25[1],zmm19[3],zmm25[3],zmm19[5],zmm25[5],zmm19[7],zmm25[7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm22, %zmm14, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm2 = zmm19[1],zmm15[1],zmm19[3],zmm15[3],zmm19[5],zmm15[5],zmm19[7],zmm15[7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm19, %zmm9
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k1}
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm1, %zmm14, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm16, %zmm14, %zmm0
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm3
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm7
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm5, %zmm13, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm7
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm5, %zmm11, %zmm3
 ; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
 ; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm0
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm1, %zmm15, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm16, %zmm12, %zmm0
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm2
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm5, %zmm11, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm5, %zmm1, %zmm2
 ; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm19, %zmm2
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm25, %zmm15, %zmm2
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm17[0],zmm20[0],zmm17[2],zmm20[2],zmm17[4],zmm20[4],zmm17[6],zmm20[6]
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm15, %zmm12, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm18[0],zmm22[0],zmm18[2],zmm22[2],zmm18[4],zmm22[4],zmm18[6],zmm22[6]
 ; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm0
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 128(%rcx), %zmm0
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm1, %zmm26, %zmm6
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm16, %zmm26, %zmm6
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm19, %zmm1
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm25, %zmm26, %zmm1
-; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm17[1],zmm20[1],zmm17[3],zmm20[3],zmm17[5],zmm20[5],zmm17[7],zmm20[7]
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm15, %zmm26, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm18[1],zmm22[1],zmm18[3],zmm22[3],zmm18[5],zmm22[5],zmm18[7],zmm22[7]
 ; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm5, %zmm28, %zmm8
 ; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm8[4,5,6,7]
 ; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm1, %zmm1
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 (%rsp), %zmm11 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm1
 ; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm0, %zmm4, %zmm1
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm1
 ; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm0, %zmm7, %zmm1
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm1
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm0, %zmm11, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm0, %zmm3, %zmm1
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm0, %zmm28, %zmm13
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm0, %zmm28, %zmm11
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm11, (%rsp) # 64-byte Spill
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 192(%rcx), %zmm0
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 (%rsp), %zmm13 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm0, %zmm13, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm0, %zmm11, %zmm4
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm0, %zmm13, %zmm7
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm0, %zmm11, %zmm7
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm0, %zmm13, %zmm11
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm0, %zmm11, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm0, %zmm28, %zmm11
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm0, %zmm28, %zmm13
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm13, (%rsp) # 64-byte Spill
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm23, %zmm12, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm23, %zmm13, %zmm1
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1
 ; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm23, %zmm14, %zmm1
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm23, %zmm15, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm23, %zmm12, %zmm1
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm23, %zmm26, %zmm0
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 128(%r10), %zmm28
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 128(%rax), %zmm4
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm4, %zmm12, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm4, %zmm13, %zmm0
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm0
 ; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm4, %zmm14, %zmm0
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 128(%r8), %zmm16
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 128(%r9), %zmm13
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm13, %zmm15, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm9
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 128(%r9), %zmm11
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm24
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm11, %zmm12, %zmm24
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 192(%r10), %zmm23
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 192(%rax), %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 192(%rax), %zmm5
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm1, %zmm12, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm5, %zmm13, %zmm0
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 192(%rdi), %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 192(%rdi), %zmm1
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 192(%rsi), %zmm0
-; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm0, %zmm2, %zmm12
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm3
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm1, %zmm14, %zmm3
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm0, %zmm2, %zmm14
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm0, %zmm1, %zmm13
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm5, %zmm14, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm0, %zmm1, %zmm14
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 192(%r8), %zmm31
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 192(%r9), %zmm3
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm5
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm3, %zmm15, %zmm5
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm10
-; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm0, %zmm2, %zmm15
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm0, %zmm26, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm29
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm3, %zmm12, %zmm29
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm0, %zmm1, %zmm12
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm0, %zmm26, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm19
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm11, %zmm26, %zmm19
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm27
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm3, %zmm26, %zmm27
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,0,8,0,8,0,8]
+; AVX512DQBW-SLOW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm21, %zmm0, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [1,9,1,9,1,9,1,9]
+; AVX512DQBW-SLOW-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm21, %zmm1, %zmm2
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm27
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm13, %zmm26, %zmm27
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm3, %zmm26, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm2
-; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm5 = [0,8,0,8,0,8,0,8]
-; AVX512DQBW-SLOW-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm30
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm21, %zmm5, %zmm30
-; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm6 = [1,9,1,9,1,9,1,9]
-; AVX512DQBW-SLOW-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm21, %zmm6, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm26 = [2,10,2,10,2,10,2,10]
 ; AVX512DQBW-SLOW-NEXT:    # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm21, %zmm26, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11]
-; AVX512DQBW-SLOW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm21, %zmm0, %zmm18
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm22, %zmm7
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm22, %zmm24
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm29, %zmm5, %zmm24
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm29, %zmm6, %zmm22
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm21
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm29, %zmm26, %zmm21
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm29, %zmm0, %zmm7
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm29
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm20, %zmm5, %zmm29
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm7
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm20, %zmm6, %zmm7
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm7
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm20, %zmm26, %zmm7
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm20, %zmm0, %zmm17
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm21, %zmm26, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,3,11,3,11,3,11]
+; AVX512DQBW-SLOW-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm21, %zmm2, %zmm17
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm19, %zmm7
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm25, %zmm5, %zmm7
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm19, %zmm8
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm25, %zmm6, %zmm8
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm19, %zmm20
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm25, %zmm26, %zmm20
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm25, %zmm0, %zmm19
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm19
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm4, %zmm5, %zmm19
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm25
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm4, %zmm6, %zmm25
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm28[0],zmm4[0],zmm28[2],zmm4[2],zmm28[4],zmm4[4],zmm28[6],zmm4[6]
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm28[1],zmm4[1],zmm28[3],zmm4[3],zmm28[5],zmm4[5],zmm28[7],zmm4[7]
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm9
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm4, %zmm26, %zmm9
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm4, %zmm0, %zmm28
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm9
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm13, %zmm5, %zmm9
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm25
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm30, %zmm0, %zmm25
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm17
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm30, %zmm1, %zmm17
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm21
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm30, %zmm26, %zmm21
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm30, %zmm2, %zmm20
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm20
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm22, %zmm0, %zmm20
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm30
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm22, %zmm1, %zmm30
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm6
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm22, %zmm26, %zmm6
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm22, %zmm2, %zmm18
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm8
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm6
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm15, %zmm0, %zmm6
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm7
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm15, %zmm1, %zmm7
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm22
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm15, %zmm26, %zmm22
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm15, %zmm2, %zmm8
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm14
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm4, %zmm0, %zmm14
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm18
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm4, %zmm1, %zmm18
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm24 {%k1} = zmm28[0],zmm4[0],zmm28[2],zmm4[2],zmm28[4],zmm4[4],zmm28[6],zmm4[6]
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm19 {%k1} = zmm28[1],zmm4[1],zmm28[3],zmm4[3],zmm28[5],zmm4[5],zmm28[7],zmm4[7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm8
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm4, %zmm26, %zmm8
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm4, %zmm2, %zmm28
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm8
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm11, %zmm0, %zmm8
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm15
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm13, %zmm6, %zmm15
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm12
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm13, %zmm26, %zmm12
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm14 = zmm16[0],zmm13[0],zmm16[2],zmm13[2],zmm16[4],zmm13[4],zmm16[6],zmm13[6]
-; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm17 = zmm16[1],zmm13[1],zmm16[3],zmm13[3],zmm16[5],zmm13[5],zmm16[7],zmm13[7]
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm13, %zmm0, %zmm16
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm11, %zmm1, %zmm15
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm9
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm11, %zmm26, %zmm9
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm10 = zmm16[0],zmm11[0],zmm16[2],zmm11[2],zmm16[4],zmm11[4],zmm16[6],zmm11[6]
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm12 = zmm16[1],zmm11[1],zmm16[3],zmm11[3],zmm16[5],zmm11[5],zmm16[7],zmm11[7]
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm11, %zmm2, %zmm16
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm11
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm5, %zmm0, %zmm11
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm13
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm1, %zmm5, %zmm13
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm18
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm1, %zmm6, %zmm18
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm10 {%k1} = zmm23[0],zmm1[0],zmm23[2],zmm1[2],zmm23[4],zmm1[4],zmm23[6],zmm1[6]
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm23[1],zmm1[1],zmm23[3],zmm1[3],zmm23[5],zmm1[5],zmm23[7],zmm1[7]
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm2
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm1, %zmm26, %zmm2
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm1, %zmm0, %zmm23
-; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm3, %zmm31, %zmm5
-; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm3, %zmm31, %zmm6
-; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm3, %zmm31, %zmm26
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm10 = zmm31[0],zmm3[0],zmm31[2],zmm3[2],zmm31[4],zmm3[4],zmm31[6],zmm3[6]
-; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm11 = zmm31[1],zmm3[1],zmm31[3],zmm3[3],zmm31[5],zmm3[5],zmm31[7],zmm3[7]
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm3, %zmm0, %zmm31
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm30, %zmm24 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vmovdqa (%rcx), %xmm3
-; AVX512DQBW-SLOW-NEXT:    vmovdqa (%rdx), %xmm2
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm2[0],xmm3[0]
-; AVX512DQBW-SLOW-NEXT:    vmovdqa (%rsi), %xmm1
-; AVX512DQBW-SLOW-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm30 = xmm0[0],xmm1[0]
-; AVX512DQBW-SLOW-NEXT:    vinserti32x4 $1, %xmm4, %ymm30, %ymm4
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm24, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm5, %zmm1, %zmm13
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm29 {%k1} = zmm23[0],zmm5[0],zmm23[2],zmm5[2],zmm23[4],zmm5[4],zmm23[6],zmm5[6]
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm23[1],zmm5[1],zmm23[3],zmm5[3],zmm23[5],zmm5[5],zmm23[7],zmm5[7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm5, %zmm26, %zmm4
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm22 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1]
-; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; AVX512DQBW-SLOW-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm22, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm29, %zmm7 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vmovdqa 64(%rcx), %xmm0
-; AVX512DQBW-SLOW-NEXT:    vmovdqa 64(%rdx), %xmm1
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX512DQBW-SLOW-NEXT:    vmovdqa 64(%rsi), %xmm4
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 64(%rdi), %xmm29
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm22 = xmm29[0],xmm4[0]
-; AVX512DQBW-SLOW-NEXT:    vinserti32x4 $1, %xmm2, %ymm22, %ymm2
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm5, %zmm2, %zmm23
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm3, %zmm31, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm3, %zmm31, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm3, %zmm31, %zmm26
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm5 = zmm31[0],zmm3[0],zmm31[2],zmm3[2],zmm31[4],zmm3[4],zmm31[6],zmm3[6]
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm4 = zmm31[1],zmm3[1],zmm31[3],zmm3[3],zmm31[5],zmm3[5],zmm31[7],zmm3[7]
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm3, %zmm2, %zmm31
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm25 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa (%rsi), %xmm2
+; AVX512DQBW-SLOW-NEXT:    vinserti128 $1, (%rcx), %ymm2, %ymm3
+; AVX512DQBW-SLOW-NEXT:    vmovdqa (%rdi), %xmm2
+; AVX512DQBW-SLOW-NEXT:    vinserti128 $1, (%rdx), %ymm2, %ymm2
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm29 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm29, %zmm25, %zmm25
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm25, %zmm17 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm17, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm6 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 64(%rsi), %xmm2
+; AVX512DQBW-SLOW-NEXT:    vinserti128 $1, 64(%rcx), %ymm2, %ymm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 64(%rdi), %xmm17
+; AVX512DQBW-SLOW-NEXT:    vinserti32x4 $1, 64(%rdx), %ymm17, %ymm17
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm20 = ymm17[0],ymm2[0],ymm17[2],ymm2[2]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm20, %zmm6, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm30, %zmm7 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm17[1],ymm2[1],ymm17[3],ymm2[3]
 ; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm7, %zmm2
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm8 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm1 = xmm29[1],xmm4[1]
-; AVX512DQBW-SLOW-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm8, %zmm30
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm19, %zmm9 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vmovdqa 128(%rcx), %xmm1
-; AVX512DQBW-SLOW-NEXT:    vmovdqa 128(%rdx), %xmm2
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm2[0],xmm1[0]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm14, %zmm8 {%k1}
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa 128(%rsi), %xmm7
-; AVX512DQBW-SLOW-NEXT:    vmovdqa 128(%rdi), %xmm8
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm19 = xmm8[0],xmm7[0]
-; AVX512DQBW-SLOW-NEXT:    vinserti32x4 $1, %xmm4, %ymm19, %ymm4
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm9, %zmm29
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm25, %zmm15 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm1 = xmm2[1],xmm1[1]
-; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm8[1],xmm7[1]
-; AVX512DQBW-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm15, %zmm24
+; AVX512DQBW-SLOW-NEXT:    vinserti128 $1, 128(%rcx), %ymm7, %ymm7
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 128(%rdi), %xmm14
+; AVX512DQBW-SLOW-NEXT:    vinserti128 $1, 128(%rdx), %ymm14, %ymm14
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm17 = ymm14[0],ymm7[0],ymm14[2],ymm7[2]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm17, %zmm8, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm15 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm7 = ymm14[1],ymm7[1],ymm14[3],ymm7[3]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm7, %zmm15, %zmm30
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm10 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm14 # 32-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT:    # ymm14 = ymm2[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm14, %zmm10, %zmm29
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm12 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm14 # 32-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT:    # ymm14 = ymm2[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm14, %zmm12, %zmm25
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm14 # 32-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT:    # ymm14 = ymm2[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm14, %zmm24, %zmm24
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpblendd $240, (%rsp), %ymm2, %ymm15 # 32-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT:    # ymm15 = ymm2[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm15, %zmm19, %zmm15
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm0 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 192(%rsi), %xmm11
+; AVX512DQBW-SLOW-NEXT:    vinserti128 $1, 192(%rcx), %ymm11, %ymm11
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 192(%rdi), %xmm17
+; AVX512DQBW-SLOW-NEXT:    vinserti32x4 $1, 192(%rdx), %ymm17, %ymm17
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm18 = ymm17[0],ymm11[0],ymm17[2],ymm11[2]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm18, %zmm0, %zmm7
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm1 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm11 = ymm17[1],ymm11[1],ymm17[3],ymm11[3]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm11, %zmm1, %zmm1
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm14 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm5 {%k1}
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
-; AVX512DQBW-SLOW-NEXT:    # ymm2 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm14, %zmm22
+; AVX512DQBW-SLOW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT:    # ymm11 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm11, %zmm5, %zmm5
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm17 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm4 {%k1}
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload
-; AVX512DQBW-SLOW-NEXT:    # ymm7 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm7, %zmm17, %zmm7
+; AVX512DQBW-SLOW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT:    # ymm11 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm11, %zmm4, %zmm4
 ; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload
-; AVX512DQBW-SLOW-NEXT:    # ymm8 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT:    # ymm11 = ymm0[0,1,2,3],mem[4,5,6,7]
 ; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vinsertf64x4 $0, %ymm8, %zmm0, %zmm8
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload
-; AVX512DQBW-SLOW-NEXT:    # ymm9 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm9, %zmm27, %zmm27
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm5 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vmovdqa 192(%rcx), %xmm13
-; AVX512DQBW-SLOW-NEXT:    vmovdqa 192(%rdx), %xmm14
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm15 = xmm14[0],xmm13[0]
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 192(%rsi), %xmm17
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 192(%rdi), %xmm19
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm25 = xmm19[0],xmm17[0]
-; AVX512DQBW-SLOW-NEXT:    vinserti32x4 $1, %xmm15, %ymm25, %ymm15
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm15, %zmm5, %zmm25
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm6 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm13 = xmm14[1],xmm13[1]
-; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm14 = xmm19[1],xmm17[1]
-; AVX512DQBW-SLOW-NEXT:    vinserti128 $1, %xmm13, %ymm14, %ymm13
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm13, %zmm6, %zmm6
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm10 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload
-; AVX512DQBW-SLOW-NEXT:    # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm13, %zmm10, %zmm10
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm11 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vinsertf64x4 $0, %ymm11, %zmm0, %zmm11
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQBW-SLOW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload
 ; AVX512DQBW-SLOW-NEXT:    # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm13, %zmm11, %zmm11
-; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload
-; AVX512DQBW-SLOW-NEXT:    # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vinsertf64x4 $0, %ymm13, %zmm0, %zmm13
-; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vblendps $240, (%rsp), %ymm0, %ymm14 # 32-byte Folded Reload
-; AVX512DQBW-SLOW-NEXT:    # ymm14 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vinsertf64x4 $0, %ymm14, %zmm0, %zmm14
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm13, %zmm27, %zmm27
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm21 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vmovdqa (%rcx), %ymm15
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%rdx), %ymm17
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm17[0],ymm15[0],ymm17[2],ymm15[2]
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%rsi), %ymm18
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%rdi), %ymm19
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm19[0],ymm18[0],ymm19[2],ymm18[2]
-; AVX512DQBW-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3]
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm21, %zmm21
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%rcx), %ymm17
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%rdx), %ymm18
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm18[0],ymm17[0],ymm18[2],ymm17[2]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%rsi), %ymm19
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%rdi), %ymm20
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm6 = ymm20[0],ymm19[0],ymm20[2],ymm19[2]
+; AVX512DQBW-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm6[2,3],ymm3[2,3]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm21, %zmm3
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm17[1],ymm15[1],ymm17[3],ymm15[3]
-; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm15 = ymm19[1],ymm18[1],ymm19[3],ymm18[3]
-; AVX512DQBW-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm15[2,3],ymm3[2,3]
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm1, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm8 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm6 = ymm18[1],ymm17[1],ymm18[3],ymm17[3]
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm20[1],ymm19[1],ymm20[3],ymm19[3]
+; AVX512DQBW-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm2[2,3],ymm6[2,3]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm8, %zmm2
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm20 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vmovdqa 64(%rsi), %ymm15
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 64(%rdi), %ymm17
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 64(%rcx), %ymm18
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 64(%rdx), %ymm19
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm9 = ymm19[0],ymm18[0],ymm19[2],ymm18[2]
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm4 = ymm17[0],ymm15[0],ymm17[2],ymm15[2]
-; AVX512DQBW-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm4 = ymm4[2,3],ymm9[2,3]
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm20, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm22 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 64(%rcx), %ymm6
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 64(%rdx), %ymm17
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 64(%rsi), %ymm18
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 64(%rdi), %ymm19
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm14 = ymm17[0],ymm6[0],ymm17[2],ymm6[2]
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm8 = ymm19[0],ymm18[0],ymm19[2],ymm18[2]
+; AVX512DQBW-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm8 = ymm8[2,3],ymm14[2,3]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm8, %zmm22, %zmm8
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm9 = ymm19[1],ymm18[1],ymm19[3],ymm18[3]
-; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm15 = ymm17[1],ymm15[1],ymm17[3],ymm15[3]
-; AVX512DQBW-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm9 = ymm15[2,3],ymm9[2,3]
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm9, %zmm1, %zmm9
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm10 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm6 = ymm17[1],ymm6[1],ymm17[3],ymm6[3]
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm14 = ymm19[1],ymm18[1],ymm19[3],ymm18[3]
+; AVX512DQBW-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm6 = ymm14[2,3],ymm6[2,3]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm6, %zmm10, %zmm6
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm12 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vmovdqa 128(%rcx), %ymm15
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm9 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 128(%rcx), %ymm14
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 128(%rdx), %ymm17
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm5 = ymm17[0],ymm15[0],ymm17[2],ymm15[2]
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm17[0],ymm14[0],ymm17[2],ymm14[2]
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 128(%rsi), %ymm18
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 128(%rdi), %ymm19
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm19[0],ymm18[0],ymm19[2],ymm18[2]
-; AVX512DQBW-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm5[2,3]
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm12, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm10 = ymm19[0],ymm18[0],ymm19[2],ymm18[2]
+; AVX512DQBW-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm10[2,3],ymm0[2,3]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm9, %zmm0
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm16 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm5 = ymm17[1],ymm15[1],ymm17[3],ymm15[3]
-; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm12 = ymm19[1],ymm18[1],ymm19[3],ymm18[3]
-; AVX512DQBW-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm5 = ymm12[2,3],ymm5[2,3]
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm5, %zmm16, %zmm5
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm26 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vmovdqa 192(%rcx), %ymm12
-; AVX512DQBW-SLOW-NEXT:    vmovdqa 192(%rdx), %ymm15
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm15[0],ymm12[0],ymm15[2],ymm12[2]
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm9 = ymm17[1],ymm14[1],ymm17[3],ymm14[3]
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm10 = ymm19[1],ymm18[1],ymm19[3],ymm18[3]
+; AVX512DQBW-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm9 = ymm10[2,3],ymm9[2,3]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm9, %zmm16, %zmm9
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm26 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 192(%rcx), %ymm10
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 192(%rdx), %ymm14
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm13 = ymm14[0],ymm10[0],ymm14[2],ymm10[2]
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 192(%rsi), %ymm16
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 192(%rdi), %ymm17
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm2 = ymm17[0],ymm16[0],ymm17[2],ymm16[2]
-; AVX512DQBW-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3]
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm26, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm12 = ymm17[0],ymm16[0],ymm17[2],ymm16[2]
+; AVX512DQBW-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm12 = ymm12[2,3],ymm13[2,3]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm12, %zmm26, %zmm12
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm31 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm15[1],ymm12[1],ymm15[3],ymm12[3]
-; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm12 = ymm17[1],ymm16[1],ymm17[3],ymm16[3]
-; AVX512DQBW-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm12[2,3],ymm2[2,3]
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm31, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm10 = ymm14[1],ymm10[1],ymm14[3],ymm10[3]
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm13 = ymm17[1],ymm16[1],ymm17[3],ymm16[3]
+; AVX512DQBW-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm10 = ymm13[2,3],ymm10[2,3]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm10, %zmm31, %zmm10
 ; AVX512DQBW-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm2, 1728(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, 1664(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm5, 1216(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, 1152(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm9, 704(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm4, 640(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm3, 192(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm21, 128(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm14, 1984(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm13, 1920(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm11, 1856(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm10, 1792(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm6, 1600(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm25, 1536(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm27, 1472(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm8, 1408(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm7, 1344(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm22, 1280(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm24, 1088(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm29, 1024(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm10, 1728(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm12, 1664(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm9, 1216(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, 1152(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm6, 704(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm8, 640(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm2, 192(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm3, 128(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm27, 1984(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm11, 1920(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm4, 1856(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm5, 1792(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, 1600(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm7, 1536(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm15, 1472(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm24, 1408(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm25, 1344(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm29, 1280(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm30, 1088(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 1024(%rax)
 ; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 960(%rax)
 ; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
@@ -8950,7 +7531,8 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 832(%rax)
 ; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 768(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm30, 576(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 576(%rax)
 ; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 512(%rax)
 ; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
@@ -8965,13 +7547,13 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 64(%rax)
 ; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, (%rax)
-; AVX512DQBW-SLOW-NEXT:    addq $2632, %rsp # imm = 0xA48
+; AVX512DQBW-SLOW-NEXT:    addq $2568, %rsp # imm = 0xA08
 ; AVX512DQBW-SLOW-NEXT:    vzeroupper
 ; AVX512DQBW-SLOW-NEXT:    retq
 ;
 ; AVX512DQBW-FAST-LABEL: store_i64_stride8_vf32:
 ; AVX512DQBW-FAST:       # %bb.0:
-; AVX512DQBW-FAST-NEXT:    subq $2632, %rsp # imm = 0xA48
+; AVX512DQBW-FAST-NEXT:    subq $2568, %rsp # imm = 0xA08
 ; AVX512DQBW-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512DQBW-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %r10
 ; AVX512DQBW-FAST-NEXT:    vmovaps 128(%rdi), %zmm0
@@ -8979,75 +7561,75 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%rdi), %zmm6
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%rdi), %zmm0
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 128(%rsi), %zmm23
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%rsi), %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%rsi), %zmm16
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%rsi), %zmm3
-; AVX512DQBW-FAST-NEXT:    vmovaps 192(%rdx), %zmm2
-; AVX512DQBW-FAST-NEXT:    vmovups %zmm2, (%rsp) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovaps 128(%rdx), %zmm2
-; AVX512DQBW-FAST-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovaps 192(%rdx), %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovaps 128(%rdx), %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovups %zmm1, (%rsp) # 64-byte Spill
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%rdx), %zmm8
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%rdx), %zmm2
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%rcx), %zmm5
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%rcx), %zmm4
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%r8), %zmm22
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%r8), %zmm20
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%r8), %zmm19
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%r9), %zmm29
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%r9), %zmm25
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%r10), %zmm18
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%r10), %zmm17
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%r9), %zmm30
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%r9), %zmm15
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%r10), %zmm17
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%r10), %zmm18
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%rax), %zmm21
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%rax), %zmm20
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%rax), %zmm22
 ; AVX512DQBW-FAST-NEXT:    movb $-64, %r11b
 ; AVX512DQBW-FAST-NEXT:    kmovd %r11d, %k1
-; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm12 = [4,12,4,12,4,12,4,12]
-; AVX512DQBW-FAST-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm18, %zmm7
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm21, %zmm12, %zmm7
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm9 = zmm22[0],zmm29[0],zmm22[2],zmm29[2],zmm22[4],zmm29[4],zmm22[6],zmm29[6]
+; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm13 = [4,12,4,12,4,12,4,12]
+; AVX512DQBW-FAST-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm17, %zmm7
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm21, %zmm13, %zmm7
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm9 = zmm20[0],zmm30[0],zmm20[2],zmm30[2],zmm20[4],zmm30[4],zmm20[6],zmm30[6]
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm7, %zmm9 {%k1}
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm7
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm3, %zmm12, %zmm7
-; AVX512DQBW-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm11 = [4,12,4,12]
-; AVX512DQBW-FAST-NEXT:    # ymm11 = mem[0,1,0,1]
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm3, %zmm13, %zmm7
+; AVX512DQBW-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [4,12,4,12]
+; AVX512DQBW-FAST-NEXT:    # ymm1 = mem[0,1,0,1]
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm10
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm4, %zmm11, %zmm10
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm4, %zmm1, %zmm10
 ; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7]
 ; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm7, %zmm9, %zmm7
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm14 = [5,13,5,13,5,13,5,13]
 ; AVX512DQBW-FAST-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm18, %zmm7
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm17, %zmm7
 ; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm21, %zmm14, %zmm7
-; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm9 = zmm22[1],zmm29[1],zmm22[3],zmm29[3],zmm22[5],zmm29[5],zmm22[7],zmm29[7]
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm9 = zmm20[1],zmm30[1],zmm20[3],zmm30[3],zmm20[5],zmm30[5],zmm20[7],zmm30[7]
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm7, %zmm9 {%k1}
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm7
 ; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm3, %zmm14, %zmm7
-; AVX512DQBW-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm13 = [5,13,5,13]
-; AVX512DQBW-FAST-NEXT:    # ymm13 = mem[0,1,0,1]
+; AVX512DQBW-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm11 = [5,13,5,13]
+; AVX512DQBW-FAST-NEXT:    # ymm11 = mem[0,1,0,1]
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm10
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm4, %zmm13, %zmm10
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm4, %zmm11, %zmm10
 ; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7]
 ; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm7, %zmm9, %zmm7
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm15 = [6,14,6,14,6,14,6,14]
-; AVX512DQBW-FAST-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm22, %zmm7
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm29, %zmm15, %zmm7
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm18[0],zmm21[0],zmm18[2],zmm21[2],zmm18[4],zmm21[4],zmm18[6],zmm21[6]
+; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm12 = [6,14,6,14,6,14,6,14]
+; AVX512DQBW-FAST-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm20, %zmm7
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm30, %zmm12, %zmm7
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm17[0],zmm21[0],zmm17[2],zmm21[2],zmm17[4],zmm21[4],zmm17[6],zmm21[6]
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm9
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm3, %zmm15, %zmm9
-; AVX512DQBW-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm11 = [6,14,6,14]
-; AVX512DQBW-FAST-NEXT:    # ymm11 = mem[0,1,0,1]
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm3, %zmm12, %zmm9
+; AVX512DQBW-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14]
+; AVX512DQBW-FAST-NEXT:    # ymm1 = mem[0,1,0,1]
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm10
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm4, %zmm11, %zmm10
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm4, %zmm1, %zmm10
 ; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7]
 ; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm9, %zmm7, %zmm7
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm26 = [7,15,7,15,7,15,7,15]
 ; AVX512DQBW-FAST-NEXT:    # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm22, %zmm7
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm29, %zmm26, %zmm7
-; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm7 {%k1} = zmm18[1],zmm21[1],zmm18[3],zmm21[3],zmm18[5],zmm21[5],zmm18[7],zmm21[7]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm20, %zmm7
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm30, %zmm26, %zmm7
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm7 {%k1} = zmm17[1],zmm21[1],zmm17[3],zmm21[3],zmm17[5],zmm21[5],zmm17[7],zmm21[7]
 ; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm3, %zmm26, %zmm0
 ; AVX512DQBW-FAST-NEXT:    vbroadcasti64x2 {{.*#+}} ymm28 = [7,15,7,15]
 ; AVX512DQBW-FAST-NEXT:    # ymm28 = mem[0,1,0,1]
@@ -9055,12 +7637,12 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
 ; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm7, %zmm0
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm17, %zmm0
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm20, %zmm12, %zmm0
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm2 = zmm19[0],zmm25[0],zmm19[2],zmm25[2],zmm19[4],zmm25[4],zmm19[6],zmm25[6]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm18, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm22, %zmm13, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm2 = zmm19[0],zmm15[0],zmm19[2],zmm15[2],zmm19[4],zmm15[4],zmm19[6],zmm15[6]
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k1}
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm6, %zmm0
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm1, %zmm12, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm16, %zmm13, %zmm0
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm8, %zmm3
 ; AVX512DQBW-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [4,12,4,12]
 ; AVX512DQBW-FAST-NEXT:    # ymm4 = mem[0,1,0,1]
@@ -9068,383 +7650,366 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
 ; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm0
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm17, %zmm0
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm20, %zmm14, %zmm0
-; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm2 = zmm19[1],zmm25[1],zmm19[3],zmm25[3],zmm19[5],zmm25[5],zmm19[7],zmm25[7]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm18, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm22, %zmm14, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm2 = zmm19[1],zmm15[1],zmm19[3],zmm15[3],zmm19[5],zmm15[5],zmm19[7],zmm15[7]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm19, %zmm9
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k1}
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm6, %zmm0
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm1, %zmm14, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm16, %zmm14, %zmm0
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm8, %zmm3
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm13, %zmm7
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm5, %zmm13, %zmm3
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm11, %zmm7
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm5, %zmm11, %zmm3
 ; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
 ; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm0
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm6, %zmm0
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm1, %zmm15, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm16, %zmm12, %zmm0
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm8, %zmm2
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm5, %zmm11, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, %zmm3
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm5, %zmm1, %zmm2
 ; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm19, %zmm2
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm25, %zmm15, %zmm2
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm17[0],zmm20[0],zmm17[2],zmm20[2],zmm17[4],zmm20[4],zmm17[6],zmm20[6]
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm15, %zmm12, %zmm2
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm18[0],zmm22[0],zmm18[2],zmm22[2],zmm18[4],zmm22[4],zmm18[6],zmm22[6]
 ; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm2, %zmm0
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 128(%rcx), %zmm0
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm1, %zmm26, %zmm6
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm16, %zmm26, %zmm6
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm19, %zmm1
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm25, %zmm26, %zmm1
-; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm17[1],zmm20[1],zmm17[3],zmm20[3],zmm17[5],zmm20[5],zmm17[7],zmm20[7]
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm15, %zmm26, %zmm1
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm18[1],zmm22[1],zmm18[3],zmm22[3],zmm18[5],zmm22[5],zmm18[7],zmm22[7]
 ; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm5, %zmm28, %zmm8
 ; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm8[4,5,6,7]
 ; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm1, %zmm1
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm13, %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 (%rsp), %zmm11 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm11, %zmm1
 ; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm0, %zmm4, %zmm1
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm13, %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm11, %zmm1
 ; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm0, %zmm7, %zmm1
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm13, %zmm1
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm0, %zmm11, %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm11, %zmm1
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm0, %zmm3, %zmm1
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm0, %zmm28, %zmm13
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm0, %zmm28, %zmm11
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm11, (%rsp) # 64-byte Spill
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 192(%rcx), %zmm0
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 (%rsp), %zmm13 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm0, %zmm13, %zmm4
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm0, %zmm11, %zmm4
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm0, %zmm13, %zmm7
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm0, %zmm11, %zmm7
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm0, %zmm13, %zmm11
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm0, %zmm11, %zmm3
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm0, %zmm28, %zmm11
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm0, %zmm28, %zmm13
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm13, (%rsp) # 64-byte Spill
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm23, %zmm12, %zmm1
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm23, %zmm13, %zmm1
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1
 ; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm23, %zmm14, %zmm1
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm23, %zmm15, %zmm1
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm23, %zmm12, %zmm1
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm23, %zmm26, %zmm0
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 128(%r10), %zmm28
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 128(%rax), %zmm4
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm28, %zmm0
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm4, %zmm12, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm4, %zmm13, %zmm0
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm28, %zmm0
 ; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm4, %zmm14, %zmm0
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 128(%r8), %zmm16
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 128(%r9), %zmm13
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm16, %zmm0
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm13, %zmm15, %zmm0
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm9
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 128(%r9), %zmm11
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm16, %zmm24
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm11, %zmm12, %zmm24
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 192(%r10), %zmm23
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 192(%rax), %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 192(%rax), %zmm5
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm23, %zmm0
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm1, %zmm12, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm5, %zmm13, %zmm0
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 192(%rdi), %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 192(%rdi), %zmm1
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 192(%rsi), %zmm0
-; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm0, %zmm2, %zmm12
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm23, %zmm3
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm1, %zmm14, %zmm3
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm0, %zmm2, %zmm14
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm0, %zmm1, %zmm13
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm23, %zmm2
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm5, %zmm14, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm0, %zmm1, %zmm14
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 192(%r8), %zmm31
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 192(%r9), %zmm3
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm31, %zmm5
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm3, %zmm15, %zmm5
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm5, %zmm10
-; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm0, %zmm2, %zmm15
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm0, %zmm26, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm31, %zmm29
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm3, %zmm12, %zmm29
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm0, %zmm1, %zmm12
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm0, %zmm26, %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm16, %zmm19
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm11, %zmm26, %zmm19
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm31, %zmm27
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm3, %zmm26, %zmm27
+; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,0,8,0,8,0,8]
+; AVX512DQBW-FAST-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm17, %zmm1
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm21, %zmm0, %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [1,9,1,9,1,9,1,9]
+; AVX512DQBW-FAST-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm17, %zmm2
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm21, %zmm1, %zmm2
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm16, %zmm27
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm13, %zmm26, %zmm27
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm31, %zmm0
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm3, %zmm26, %zmm0
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm2
-; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm5 = [0,8,0,8,0,8,0,8]
-; AVX512DQBW-FAST-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm18, %zmm30
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm21, %zmm5, %zmm30
-; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm6 = [1,9,1,9,1,9,1,9]
-; AVX512DQBW-FAST-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm18, %zmm0
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm21, %zmm6, %zmm0
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm26 = [2,10,2,10,2,10,2,10]
 ; AVX512DQBW-FAST-NEXT:    # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm18, %zmm0
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm21, %zmm26, %zmm0
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11]
-; AVX512DQBW-FAST-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm21, %zmm0, %zmm18
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm22, %zmm7
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm22, %zmm24
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm29, %zmm5, %zmm24
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm29, %zmm6, %zmm22
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm7, %zmm21
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm29, %zmm26, %zmm21
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm29, %zmm0, %zmm7
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm17, %zmm29
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm20, %zmm5, %zmm29
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm17, %zmm7
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm20, %zmm6, %zmm7
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm17, %zmm7
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm20, %zmm26, %zmm7
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm20, %zmm0, %zmm17
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm17, %zmm2
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm21, %zmm26, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,3,11,3,11,3,11]
+; AVX512DQBW-FAST-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm21, %zmm2, %zmm17
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm19, %zmm7
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm25, %zmm5, %zmm7
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm19, %zmm8
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm25, %zmm6, %zmm8
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm19, %zmm20
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm25, %zmm26, %zmm20
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm25, %zmm0, %zmm19
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm28, %zmm19
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm4, %zmm5, %zmm19
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm28, %zmm25
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm4, %zmm6, %zmm25
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm28[0],zmm4[0],zmm28[2],zmm4[2],zmm28[4],zmm4[4],zmm28[6],zmm4[6]
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm28[1],zmm4[1],zmm28[3],zmm4[3],zmm28[5],zmm4[5],zmm28[7],zmm4[7]
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm28, %zmm9
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm4, %zmm26, %zmm9
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm4, %zmm0, %zmm28
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm16, %zmm9
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm13, %zmm5, %zmm9
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm20, %zmm25
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm30, %zmm0, %zmm25
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm20, %zmm17
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm30, %zmm1, %zmm17
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm20, %zmm21
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm30, %zmm26, %zmm21
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm30, %zmm2, %zmm20
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm18, %zmm20
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm22, %zmm0, %zmm20
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm18, %zmm30
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm22, %zmm1, %zmm30
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm18, %zmm6
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm22, %zmm26, %zmm6
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm22, %zmm2, %zmm18
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm9, %zmm8
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm9, %zmm6
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm15, %zmm0, %zmm6
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm9, %zmm7
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm15, %zmm1, %zmm7
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm9, %zmm22
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm15, %zmm26, %zmm22
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm15, %zmm2, %zmm8
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm28, %zmm14
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm4, %zmm0, %zmm14
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm28, %zmm18
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm4, %zmm1, %zmm18
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm24 {%k1} = zmm28[0],zmm4[0],zmm28[2],zmm4[2],zmm28[4],zmm4[4],zmm28[6],zmm4[6]
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm19 {%k1} = zmm28[1],zmm4[1],zmm28[3],zmm4[3],zmm28[5],zmm4[5],zmm28[7],zmm4[7]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm28, %zmm8
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm4, %zmm26, %zmm8
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm4, %zmm2, %zmm28
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm16, %zmm8
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm11, %zmm0, %zmm8
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm16, %zmm15
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm13, %zmm6, %zmm15
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm16, %zmm12
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm13, %zmm26, %zmm12
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm14 = zmm16[0],zmm13[0],zmm16[2],zmm13[2],zmm16[4],zmm13[4],zmm16[6],zmm13[6]
-; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm17 = zmm16[1],zmm13[1],zmm16[3],zmm13[3],zmm16[5],zmm13[5],zmm16[7],zmm13[7]
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm13, %zmm0, %zmm16
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm11, %zmm1, %zmm15
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm16, %zmm9
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm11, %zmm26, %zmm9
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm10 = zmm16[0],zmm11[0],zmm16[2],zmm11[2],zmm16[4],zmm11[4],zmm16[6],zmm11[6]
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm12 = zmm16[1],zmm11[1],zmm16[3],zmm11[3],zmm16[5],zmm11[5],zmm16[7],zmm11[7]
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm11, %zmm2, %zmm16
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm23, %zmm11
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm5, %zmm0, %zmm11
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm23, %zmm13
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm1, %zmm5, %zmm13
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm23, %zmm18
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm1, %zmm6, %zmm18
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm10 {%k1} = zmm23[0],zmm1[0],zmm23[2],zmm1[2],zmm23[4],zmm1[4],zmm23[6],zmm1[6]
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm23[1],zmm1[1],zmm23[3],zmm1[3],zmm23[5],zmm1[5],zmm23[7],zmm1[7]
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm23, %zmm2
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm1, %zmm26, %zmm2
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm1, %zmm0, %zmm23
-; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm3, %zmm31, %zmm5
-; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm3, %zmm31, %zmm6
-; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm3, %zmm31, %zmm26
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm10 = zmm31[0],zmm3[0],zmm31[2],zmm3[2],zmm31[4],zmm3[4],zmm31[6],zmm3[6]
-; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm11 = zmm31[1],zmm3[1],zmm31[3],zmm3[3],zmm31[5],zmm3[5],zmm31[7],zmm3[7]
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm3, %zmm0, %zmm31
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm30, %zmm24 {%k1}
-; AVX512DQBW-FAST-NEXT:    vmovdqa (%rcx), %xmm3
-; AVX512DQBW-FAST-NEXT:    vmovdqa (%rdx), %xmm2
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm2[0],xmm3[0]
-; AVX512DQBW-FAST-NEXT:    vmovdqa (%rsi), %xmm1
-; AVX512DQBW-FAST-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm30 = xmm0[0],xmm1[0]
-; AVX512DQBW-FAST-NEXT:    vinserti32x4 $1, %xmm4, %ymm30, %ymm4
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm24, %zmm4
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm5, %zmm1, %zmm13
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm29 {%k1} = zmm23[0],zmm5[0],zmm23[2],zmm5[2],zmm23[4],zmm5[4],zmm23[6],zmm5[6]
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm23[1],zmm5[1],zmm23[3],zmm5[3],zmm23[5],zmm5[5],zmm23[7],zmm5[7]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm23, %zmm4
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm5, %zmm26, %zmm4
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm4, %zmm22 {%k1}
-; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1]
-; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; AVX512DQBW-FAST-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm22, %zmm0
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm29, %zmm7 {%k1}
-; AVX512DQBW-FAST-NEXT:    vmovdqa 64(%rcx), %xmm0
-; AVX512DQBW-FAST-NEXT:    vmovdqa 64(%rdx), %xmm1
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX512DQBW-FAST-NEXT:    vmovdqa 64(%rsi), %xmm4
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%rdi), %xmm29
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm22 = xmm29[0],xmm4[0]
-; AVX512DQBW-FAST-NEXT:    vinserti32x4 $1, %xmm2, %ymm22, %ymm2
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm5, %zmm2, %zmm23
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm3, %zmm31, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm3, %zmm31, %zmm1
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm3, %zmm31, %zmm26
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm5 = zmm31[0],zmm3[0],zmm31[2],zmm3[2],zmm31[4],zmm3[4],zmm31[6],zmm3[6]
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm4 = zmm31[1],zmm3[1],zmm31[3],zmm3[3],zmm31[5],zmm3[5],zmm31[7],zmm3[7]
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm3, %zmm2, %zmm31
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm25 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqa (%rsi), %xmm2
+; AVX512DQBW-FAST-NEXT:    vinserti128 $1, (%rcx), %ymm2, %ymm3
+; AVX512DQBW-FAST-NEXT:    vmovdqa (%rdi), %xmm2
+; AVX512DQBW-FAST-NEXT:    vinserti128 $1, (%rdx), %ymm2, %ymm2
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm29 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm29, %zmm25, %zmm25
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm25, %zmm17 {%k1}
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm17, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm20, %zmm6 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqa 64(%rsi), %xmm2
+; AVX512DQBW-FAST-NEXT:    vinserti128 $1, 64(%rcx), %ymm2, %ymm2
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%rdi), %xmm17
+; AVX512DQBW-FAST-NEXT:    vinserti32x4 $1, 64(%rdx), %ymm17, %ymm17
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm20 = ymm17[0],ymm2[0],ymm17[2],ymm2[2]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm20, %zmm6, %zmm3
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm30, %zmm7 {%k1}
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm17[1],ymm2[1],ymm17[3],ymm2[3]
 ; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm7, %zmm2
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm8 {%k1}
-; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm1 = xmm29[1],xmm4[1]
-; AVX512DQBW-FAST-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm8, %zmm30
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm19, %zmm9 {%k1}
-; AVX512DQBW-FAST-NEXT:    vmovdqa 128(%rcx), %xmm1
-; AVX512DQBW-FAST-NEXT:    vmovdqa 128(%rdx), %xmm2
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm2[0],xmm1[0]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm14, %zmm8 {%k1}
 ; AVX512DQBW-FAST-NEXT:    vmovdqa 128(%rsi), %xmm7
-; AVX512DQBW-FAST-NEXT:    vmovdqa 128(%rdi), %xmm8
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm19 = xmm8[0],xmm7[0]
-; AVX512DQBW-FAST-NEXT:    vinserti32x4 $1, %xmm4, %ymm19, %ymm4
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm9, %zmm29
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm25, %zmm15 {%k1}
-; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm1 = xmm2[1],xmm1[1]
-; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm8[1],xmm7[1]
-; AVX512DQBW-FAST-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm15, %zmm24
+; AVX512DQBW-FAST-NEXT:    vinserti128 $1, 128(%rcx), %ymm7, %ymm7
+; AVX512DQBW-FAST-NEXT:    vmovdqa 128(%rdi), %xmm14
+; AVX512DQBW-FAST-NEXT:    vinserti128 $1, 128(%rdx), %ymm14, %ymm14
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm17 = ymm14[0],ymm7[0],ymm14[2],ymm7[2]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm17, %zmm8, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm18, %zmm15 {%k1}
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm7 = ymm14[1],ymm7[1],ymm14[3],ymm7[3]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm7, %zmm15, %zmm30
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm10 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm14 # 32-byte Folded Reload
+; AVX512DQBW-FAST-NEXT:    # ymm14 = ymm2[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm14, %zmm10, %zmm29
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm12 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm14 # 32-byte Folded Reload
+; AVX512DQBW-FAST-NEXT:    # ymm14 = ymm2[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm14, %zmm12, %zmm25
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm14 # 32-byte Folded Reload
+; AVX512DQBW-FAST-NEXT:    # ymm14 = ymm2[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm14, %zmm24, %zmm24
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpblendd $240, (%rsp), %ymm2, %ymm15 # 32-byte Folded Reload
+; AVX512DQBW-FAST-NEXT:    # ymm15 = ymm2[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm15, %zmm19, %zmm15
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm11, %zmm0 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqa 192(%rsi), %xmm11
+; AVX512DQBW-FAST-NEXT:    vinserti128 $1, 192(%rcx), %ymm11, %ymm11
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 192(%rdi), %xmm17
+; AVX512DQBW-FAST-NEXT:    vinserti32x4 $1, 192(%rdx), %ymm17, %ymm17
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm18 = ymm17[0],ymm11[0],ymm17[2],ymm11[2]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm18, %zmm0, %zmm7
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm13, %zmm1 {%k1}
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm11 = ymm17[1],ymm11[1],ymm17[3],ymm11[3]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm11, %zmm1, %zmm1
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm14 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm5 {%k1}
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
-; AVX512DQBW-FAST-NEXT:    # ymm2 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm14, %zmm22
+; AVX512DQBW-FAST-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload
+; AVX512DQBW-FAST-NEXT:    # ymm11 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm11, %zmm5, %zmm5
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm17 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm4 {%k1}
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload
-; AVX512DQBW-FAST-NEXT:    # ymm7 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm7, %zmm17, %zmm7
+; AVX512DQBW-FAST-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload
+; AVX512DQBW-FAST-NEXT:    # ymm11 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm11, %zmm4, %zmm4
 ; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload
-; AVX512DQBW-FAST-NEXT:    # ymm8 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload
+; AVX512DQBW-FAST-NEXT:    # ymm11 = ymm0[0,1,2,3],mem[4,5,6,7]
 ; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vinsertf64x4 $0, %ymm8, %zmm0, %zmm8
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload
-; AVX512DQBW-FAST-NEXT:    # ymm9 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm9, %zmm27, %zmm27
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm13, %zmm5 {%k1}
-; AVX512DQBW-FAST-NEXT:    vmovdqa 192(%rcx), %xmm13
-; AVX512DQBW-FAST-NEXT:    vmovdqa 192(%rdx), %xmm14
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm15 = xmm14[0],xmm13[0]
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 192(%rsi), %xmm17
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 192(%rdi), %xmm19
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm25 = xmm19[0],xmm17[0]
-; AVX512DQBW-FAST-NEXT:    vinserti32x4 $1, %xmm15, %ymm25, %ymm15
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm15, %zmm5, %zmm25
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm18, %zmm6 {%k1}
-; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm13 = xmm14[1],xmm13[1]
-; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm14 = xmm19[1],xmm17[1]
-; AVX512DQBW-FAST-NEXT:    vinserti128 $1, %xmm13, %ymm14, %ymm13
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm13, %zmm6, %zmm6
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm10 {%k1}
+; AVX512DQBW-FAST-NEXT:    vinsertf64x4 $0, %ymm11, %zmm0, %zmm11
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQBW-FAST-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload
 ; AVX512DQBW-FAST-NEXT:    # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm13, %zmm10, %zmm10
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm11 {%k1}
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload
-; AVX512DQBW-FAST-NEXT:    # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm13, %zmm11, %zmm11
-; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload
-; AVX512DQBW-FAST-NEXT:    # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vinsertf64x4 $0, %ymm13, %zmm0, %zmm13
-; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vblendps $240, (%rsp), %ymm0, %ymm14 # 32-byte Folded Reload
-; AVX512DQBW-FAST-NEXT:    # ymm14 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vinsertf64x4 $0, %ymm14, %zmm0, %zmm14
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm13, %zmm27, %zmm27
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm21 {%k1}
-; AVX512DQBW-FAST-NEXT:    vmovdqa (%rcx), %ymm15
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%rdx), %ymm17
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm17[0],ymm15[0],ymm17[2],ymm15[2]
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%rsi), %ymm18
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%rdi), %ymm19
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm19[0],ymm18[0],ymm19[2],ymm18[2]
-; AVX512DQBW-FAST-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3]
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm21, %zmm21
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%rcx), %ymm17
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%rdx), %ymm18
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm18[0],ymm17[0],ymm18[2],ymm17[2]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%rsi), %ymm19
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%rdi), %ymm20
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm6 = ymm20[0],ymm19[0],ymm20[2],ymm19[2]
+; AVX512DQBW-FAST-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm6[2,3],ymm3[2,3]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm21, %zmm3
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm17[1],ymm15[1],ymm17[3],ymm15[3]
-; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm15 = ymm19[1],ymm18[1],ymm19[3],ymm18[3]
-; AVX512DQBW-FAST-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm15[2,3],ymm3[2,3]
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm1, %zmm3
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm8 {%k1}
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm6 = ymm18[1],ymm17[1],ymm18[3],ymm17[3]
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm20[1],ymm19[1],ymm20[3],ymm19[3]
+; AVX512DQBW-FAST-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm2[2,3],ymm6[2,3]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm8, %zmm2
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm20 {%k1}
-; AVX512DQBW-FAST-NEXT:    vmovdqa 64(%rsi), %ymm15
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%rdi), %ymm17
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%rcx), %ymm18
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%rdx), %ymm19
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm9 = ymm19[0],ymm18[0],ymm19[2],ymm18[2]
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm4 = ymm17[0],ymm15[0],ymm17[2],ymm15[2]
-; AVX512DQBW-FAST-NEXT:    vperm2i128 {{.*#+}} ymm4 = ymm4[2,3],ymm9[2,3]
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm20, %zmm4
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm22 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqa 64(%rcx), %ymm6
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%rdx), %ymm17
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%rsi), %ymm18
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%rdi), %ymm19
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm14 = ymm17[0],ymm6[0],ymm17[2],ymm6[2]
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm8 = ymm19[0],ymm18[0],ymm19[2],ymm18[2]
+; AVX512DQBW-FAST-NEXT:    vperm2i128 {{.*#+}} ymm8 = ymm8[2,3],ymm14[2,3]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm8, %zmm22, %zmm8
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm9 = ymm19[1],ymm18[1],ymm19[3],ymm18[3]
-; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm15 = ymm17[1],ymm15[1],ymm17[3],ymm15[3]
-; AVX512DQBW-FAST-NEXT:    vperm2i128 {{.*#+}} ymm9 = ymm15[2,3],ymm9[2,3]
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm9, %zmm1, %zmm9
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm10 {%k1}
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm6 = ymm17[1],ymm6[1],ymm17[3],ymm6[3]
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm14 = ymm19[1],ymm18[1],ymm19[3],ymm18[3]
+; AVX512DQBW-FAST-NEXT:    vperm2i128 {{.*#+}} ymm6 = ymm14[2,3],ymm6[2,3]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm6, %zmm10, %zmm6
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm12 {%k1}
-; AVX512DQBW-FAST-NEXT:    vmovdqa 128(%rcx), %ymm15
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm9 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqa 128(%rcx), %ymm14
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 128(%rdx), %ymm17
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm5 = ymm17[0],ymm15[0],ymm17[2],ymm15[2]
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm17[0],ymm14[0],ymm17[2],ymm14[2]
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 128(%rsi), %ymm18
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 128(%rdi), %ymm19
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm19[0],ymm18[0],ymm19[2],ymm18[2]
-; AVX512DQBW-FAST-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm5[2,3]
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm12, %zmm1
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm10 = ymm19[0],ymm18[0],ymm19[2],ymm18[2]
+; AVX512DQBW-FAST-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm10[2,3],ymm0[2,3]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm9, %zmm0
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm28, %zmm16 {%k1}
-; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm5 = ymm17[1],ymm15[1],ymm17[3],ymm15[3]
-; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm12 = ymm19[1],ymm18[1],ymm19[3],ymm18[3]
-; AVX512DQBW-FAST-NEXT:    vperm2i128 {{.*#+}} ymm5 = ymm12[2,3],ymm5[2,3]
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm5, %zmm16, %zmm5
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm26 {%k1}
-; AVX512DQBW-FAST-NEXT:    vmovdqa 192(%rcx), %ymm12
-; AVX512DQBW-FAST-NEXT:    vmovdqa 192(%rdx), %ymm15
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm15[0],ymm12[0],ymm15[2],ymm12[2]
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm9 = ymm17[1],ymm14[1],ymm17[3],ymm14[3]
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm10 = ymm19[1],ymm18[1],ymm19[3],ymm18[3]
+; AVX512DQBW-FAST-NEXT:    vperm2i128 {{.*#+}} ymm9 = ymm10[2,3],ymm9[2,3]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm9, %zmm16, %zmm9
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm10, %zmm26 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqa 192(%rcx), %ymm10
+; AVX512DQBW-FAST-NEXT:    vmovdqa 192(%rdx), %ymm14
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm13 = ymm14[0],ymm10[0],ymm14[2],ymm10[2]
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 192(%rsi), %ymm16
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 192(%rdi), %ymm17
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm2 = ymm17[0],ymm16[0],ymm17[2],ymm16[2]
-; AVX512DQBW-FAST-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3]
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm26, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm12 = ymm17[0],ymm16[0],ymm17[2],ymm16[2]
+; AVX512DQBW-FAST-NEXT:    vperm2i128 {{.*#+}} ymm12 = ymm12[2,3],ymm13[2,3]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm12, %zmm26, %zmm12
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm23, %zmm31 {%k1}
-; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm15[1],ymm12[1],ymm15[3],ymm12[3]
-; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm12 = ymm17[1],ymm16[1],ymm17[3],ymm16[3]
-; AVX512DQBW-FAST-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm12[2,3],ymm2[2,3]
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm31, %zmm2
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm10 = ymm14[1],ymm10[1],ymm14[3],ymm10[3]
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm13 = ymm17[1],ymm16[1],ymm17[3],ymm16[3]
+; AVX512DQBW-FAST-NEXT:    vperm2i128 {{.*#+}} ymm10 = ymm13[2,3],ymm10[2,3]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm10, %zmm31, %zmm10
 ; AVX512DQBW-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, 1728(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, 1664(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm5, 1216(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, 1152(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm9, 704(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm4, 640(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, 192(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm21, 128(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovaps %zmm14, 1984(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovaps %zmm13, 1920(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm11, 1856(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm10, 1792(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm6, 1600(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm25, 1536(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm27, 1472(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovaps %zmm8, 1408(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm7, 1344(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm22, 1280(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm24, 1088(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm29, 1024(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm10, 1728(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm12, 1664(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm9, 1216(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, 1152(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm6, 704(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm8, 640(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, 192(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, 128(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm27, 1984(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm11, 1920(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm4, 1856(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm5, 1792(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, 1600(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm7, 1536(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm15, 1472(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm24, 1408(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm25, 1344(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm29, 1280(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm30, 1088(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 1024(%rax)
 ; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 960(%rax)
 ; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
@@ -9453,7 +8018,8 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 832(%rax)
 ; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 768(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm30, 576(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 576(%rax)
 ; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 512(%rax)
 ; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
@@ -9468,7 +8034,7 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 64(%rax)
 ; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, (%rax)
-; AVX512DQBW-FAST-NEXT:    addq $2632, %rsp # imm = 0xA48
+; AVX512DQBW-FAST-NEXT:    addq $2568, %rsp # imm = 0xA08
 ; AVX512DQBW-FAST-NEXT:    vzeroupper
 ; AVX512DQBW-FAST-NEXT:    retq
   %in.vec0 = load <32 x i64>, ptr %in.vecptr0, align 64
@@ -13251,600 +11817,613 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ;
 ; AVX512F-ONLY-SLOW-LABEL: store_i64_stride8_vf64:
 ; AVX512F-ONLY-SLOW:       # %bb.0:
-; AVX512F-ONLY-SLOW-NEXT:    subq $5384, %rsp # imm = 0x1508
+; AVX512F-ONLY-SLOW-NEXT:    subq $5576, %rsp # imm = 0x15C8
 ; AVX512F-ONLY-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512F-ONLY-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 128(%rdi), %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps 128(%rdi), %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 64(%rdi), %zmm4
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm8
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 128(%rsi), %zmm16
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 64(%rsi), %zmm17
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 128(%rsi), %zmm23
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 64(%rsi), %zmm25
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%rsi), %zmm3
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 64(%rdx), %zmm6
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%rdx), %zmm9
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 64(%rcx), %zmm7
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%rcx), %zmm11
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%r8), %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 64(%r8), %zmm24
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 128(%r8), %zmm20
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 64(%r9), %zmm25
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 128(%r9), %zmm22
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%r9), %zmm18
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 64(%r10), %zmm23
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%r10), %zmm21
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 64(%rax), %zmm26
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%rax), %zmm19
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%rcx), %zmm10
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%r8), %zmm14
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 64(%r8), %zmm19
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 128(%r8), %zmm16
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%r9), %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 64(%r9), %zmm21
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 128(%r9), %zmm18
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%r10), %zmm20
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 64(%r10), %zmm28
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%rax), %zmm26
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 64(%rax), %zmm24
 ; AVX512F-ONLY-SLOW-NEXT:    movb $-64, %r11b
 ; AVX512F-ONLY-SLOW-NEXT:    kmovw %r11d, %k1
-; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm14 = [4,12,4,12,4,12,4,12]
-; AVX512F-ONLY-SLOW-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm19, %zmm14, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm10 = zmm2[0],zmm18[0],zmm2[2],zmm18[2],zmm2[4],zmm18[4],zmm2[6],zmm18[6]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm10 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [4,12,4,12,4,12,4,12]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm26, %zmm2, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm11 = zmm14[0],zmm1[0],zmm14[2],zmm1[2],zmm14[4],zmm1[4],zmm14[6],zmm1[6]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm11 {%k1}
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm14, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm15 = [4,12,4,12]
-; AVX512F-ONLY-SLOW-NEXT:    # ymm15 = mem[0,1,0,1]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm13
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm11, %zmm15, %zmm13
-; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm10, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm2, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm13 = [4,12,4,12]
+; AVX512F-ONLY-SLOW-NEXT:    # ymm13 = mem[0,1,0,1]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm12
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm10, %zmm13, %zmm12
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm11, %zmm0
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,5,13,5,13,5,13]
-; AVX512F-ONLY-SLOW-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm19, %zmm12, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm10 = zmm2[1],zmm18[1],zmm2[3],zmm18[3],zmm2[5],zmm18[5],zmm2[7],zmm18[7]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm5
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm10 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm5 = [5,13,5,13,5,13,5,13]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm26, %zmm5, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm11 = zmm14[1],zmm1[1],zmm14[3],zmm1[3],zmm14[5],zmm1[5],zmm14[7],zmm1[7]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm14, %zmm15
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm11 {%k1}
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm12, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [5,13,5,13]
-; AVX512F-ONLY-SLOW-NEXT:    # ymm2 = mem[0,1,0,1]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm13
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm11, %zmm2, %zmm13
-; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm10, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm5, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm14 = [5,13,5,13]
+; AVX512F-ONLY-SLOW-NEXT:    # ymm14 = mem[0,1,0,1]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm12
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm10, %zmm14, %zmm12
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm11, %zmm0
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,6,14,6,14,6,14]
-; AVX512F-ONLY-SLOW-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm27
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm18, %zmm13, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm21[0],zmm19[0],zmm21[2],zmm19[2],zmm21[4],zmm19[4],zmm21[6],zmm19[6]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm10
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm13, %zmm10
-; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [6,14,6,14]
-; AVX512F-ONLY-SLOW-NEXT:    # ymm5 = mem[0,1,0,1]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm13
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm11, %zmm5, %zmm13
-; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm13[4,5,6,7]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm10, %zmm0, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm12 = [6,14,6,14,6,14,6,14]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm15, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm15, %zmm22
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm17
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm12, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm20[0],zmm26[0],zmm20[2],zmm26[2],zmm20[4],zmm26[4],zmm20[6],zmm26[6]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm11
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm12, %zmm11
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm15 = [6,14,6,14]
+; AVX512F-ONLY-SLOW-NEXT:    # ymm15 = mem[0,1,0,1]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm12
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm10, %zmm15, %zmm12
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm11, %zmm0, %zmm0
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,7,15,7,15,7,15]
 ; AVX512F-ONLY-SLOW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm27, %zmm10
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm18, %zmm0, %zmm10
-; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm10 {%k1} = zmm21[1],zmm19[1],zmm21[3],zmm19[3],zmm21[5],zmm19[5],zmm21[7],zmm19[7]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm22, %zmm11
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm17, %zmm0, %zmm11
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm11 {%k1} = zmm20[1],zmm26[1],zmm20[3],zmm26[3],zmm20[5],zmm26[5],zmm20[7],zmm26[7]
 ; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm0, %zmm8
 ; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} ymm27 = [7,15,7,15]
 ; AVX512F-ONLY-SLOW-NEXT:    # ymm27 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm11, %zmm27, %zmm9
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm10, %zmm27, %zmm9
 ; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm8, %zmm10, %zmm8
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm8, %zmm11, %zmm8
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm8
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm26, %zmm14, %zmm8
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm9 = zmm24[0],zmm25[0],zmm24[2],zmm25[2],zmm24[4],zmm25[4],zmm24[6],zmm25[6]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm8
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm12
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm24, %zmm2, %zmm8
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm9 = zmm19[0],zmm21[0],zmm19[2],zmm21[2],zmm19[4],zmm21[4],zmm19[6],zmm21[6]
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm9 {%k1}
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm8
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm17, %zmm14, %zmm8
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm25, %zmm2, %zmm8
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm10
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm15, %zmm10
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm13, %zmm10
 ; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7]
 ; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm8, %zmm9, %zmm8
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm8
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm26, %zmm12, %zmm8
-; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm9 = zmm24[1],zmm25[1],zmm24[3],zmm25[3],zmm24[5],zmm25[5],zmm24[7],zmm25[7]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm8
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm24, %zmm5, %zmm8
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm9 = zmm19[1],zmm21[1],zmm19[3],zmm21[3],zmm19[5],zmm21[5],zmm19[7],zmm21[7]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm9 {%k1}
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm8
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm17, %zmm12, %zmm8
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm25, %zmm5, %zmm8
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm10
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm2, %zmm10
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm14, %zmm10
 ; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7]
 ; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm8, %zmm9, %zmm8
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm8
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm17, %zmm11, %zmm8
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm25, %zmm1, %zmm8
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm9
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm10
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm5, %zmm9
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm15, %zmm9
 ; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm24, %zmm5
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm24, (%rsp) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm24, %zmm9
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm25, %zmm11, %zmm9
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm23[0],zmm26[0],zmm23[2],zmm26[2],zmm23[4],zmm26[4],zmm23[6],zmm26[6]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm19, %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm19, %zmm9
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm21, %zmm1, %zmm9
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm24, %zmm10
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm28[0],zmm24[0],zmm28[2],zmm24[2],zmm28[4],zmm24[4],zmm28[6],zmm24[6]
 ; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm8, %zmm9, %zmm8
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 128(%r10), %zmm9
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm17, %zmm0, %zmm4
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 128(%rax), %zmm18
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 128(%r10), %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm25, %zmm0, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 128(%rax), %zmm24
 ; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm27, %zmm6
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm25, %zmm0, %zmm5
-; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm23[1],zmm26[1],zmm23[3],zmm26[3],zmm23[5],zmm26[5],zmm23[7],zmm26[7]
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm21, %zmm0, %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm28[1],zmm10[1],zmm28[3],zmm10[3],zmm28[5],zmm10[5],zmm28[7],zmm10[7]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm21
 ; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7]
 ; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm5, %zmm4
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm4
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm18, %zmm14, %zmm4
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm5 = zmm20[0],zmm22[0],zmm20[2],zmm22[2],zmm20[4],zmm22[4],zmm20[6],zmm22[6]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm24, %zmm12, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm5 = zmm16[0],zmm18[0],zmm16[2],zmm18[2],zmm16[4],zmm18[4],zmm16[6],zmm18[6]
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm5 {%k1}
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm6
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm16, %zmm14, %zmm6
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm6
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm23, %zmm12, %zmm6
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 128(%rdx), %zmm4
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 128(%rcx), %zmm7
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm8
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm15, %zmm8
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm13, %zmm8
 ; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
 ; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm6, %zmm5, %zmm5
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm5
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm18, %zmm12, %zmm5
-; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm6 = zmm20[1],zmm22[1],zmm20[3],zmm22[3],zmm20[5],zmm22[5],zmm20[7],zmm22[7]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm24, %zmm3, %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm6 = zmm16[1],zmm18[1],zmm16[3],zmm18[3],zmm16[5],zmm18[5],zmm16[7],zmm18[7]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm6 {%k1}
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm5
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm16, %zmm12, %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm23, %zmm3, %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm10
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm8
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm2, %zmm8
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm13
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm14, %zmm8
 ; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm5, %zmm6, %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm5
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm16, %zmm11, %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm5, %zmm6, %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm11
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm23, %zmm1, %zmm5
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm6
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm10, %zmm6
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm15, %zmm6
 ; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm6
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm22, %zmm11, %zmm6
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm18[0],zmm9[2],zmm18[2],zmm9[4],zmm18[4],zmm9[6],zmm18[6]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm5, %zmm6, %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm16, %zmm0, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm6
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm18, %zmm1, %zmm6
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, (%rsp) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm2[0],zmm24[0],zmm2[2],zmm24[2],zmm2[4],zmm24[4],zmm2[6],zmm24[6]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm5, %zmm6, %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm23, %zmm0, %zmm9
 ; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm27, %zmm4
-; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm22, %zmm0, %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm18[1],zmm9[3],zmm18[3],zmm9[5],zmm18[5],zmm9[7],zmm18[7]
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm4[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm18, %zmm0, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm3[1],zmm24[1],zmm3[3],zmm24[3],zmm3[5],zmm24[5],zmm3[7],zmm24[7]
 ; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm2, %zmm1
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 192(%r10), %zmm9
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 192(%rax), %zmm20
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm1
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm20, %zmm14, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 192(%r10), %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 192(%rax), %zmm17
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm17, %zmm12, %zmm1
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 192(%r8), %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 192(%r9), %zmm10
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm4 = zmm2[0],zmm10[0],zmm2[2],zmm10[2],zmm2[4],zmm10[4],zmm2[6],zmm10[6]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm16
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 192(%r9), %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm4 = zmm2[0],zmm5[0],zmm2[2],zmm5[2],zmm2[4],zmm5[4],zmm2[6],zmm5[6]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm16
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm9
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 192(%rdi), %zmm1
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 192(%rsi), %zmm5
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm6
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm14, %zmm6
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm12, %zmm6
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 192(%rdx), %zmm2
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 192(%rcx), %zmm7
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm8
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm15, %zmm8
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm13, %zmm8
 ; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
 ; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm6, %zmm4, %zmm4
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm4
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm20, %zmm12, %zmm4
-; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm6 = zmm16[1],zmm10[1],zmm16[3],zmm10[3],zmm16[5],zmm10[5],zmm16[7],zmm10[7]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm17, %zmm10, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm6 = zmm9[1],zmm16[1],zmm9[3],zmm16[3],zmm9[5],zmm16[5],zmm9[7],zmm16[7]
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm6 {%k1}
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm4
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm12, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm10, %zmm4
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm8
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm13, %zmm8
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm14, %zmm8
 ; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
 ; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm6, %zmm4
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm4
 ; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm11, %zmm4
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm6
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm3, %zmm6
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm15, %zmm6
 ; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm6
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm10, %zmm11, %zmm6
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm20[0],zmm9[2],zmm20[2],zmm9[4],zmm20[4],zmm9[6],zmm20[6]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm6
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm16, %zmm11, %zmm6
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm3[0],zmm17[0],zmm3[2],zmm17[2],zmm3[4],zmm17[4],zmm3[6],zmm17[6]
 ; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm6, %zmm4
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm0, %zmm1
 ; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm27, %zmm2
 ; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm10, %zmm0, %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm20[1],zmm9[3],zmm20[3],zmm9[5],zmm20[5],zmm9[7],zmm20[7]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm16, %zmm0, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm3[1],zmm17[1],zmm3[3],zmm17[3],zmm3[5],zmm17[5],zmm3[7],zmm17[7]
 ; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm2, %zmm1
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 256(%r10), %zmm9
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 256(%rax), %zmm22
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm1
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm22, %zmm14, %zmm1
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 256(%r8), %zmm5
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 256(%r9), %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm4 = zmm5[0],zmm2[0],zmm5[2],zmm2[2],zmm5[4],zmm2[4],zmm5[6],zmm2[6]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm17
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm16
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 256(%r10), %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 256(%rax), %zmm25
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm25, %zmm12, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 256(%r8), %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 256(%r9), %zmm18
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm4 = zmm2[0],zmm18[0],zmm2[2],zmm18[2],zmm2[4],zmm18[4],zmm2[6],zmm18[6]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm9
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 256(%rdi), %zmm1
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 256(%rsi), %zmm5
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm6
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm14, %zmm6
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm12, %zmm6
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 256(%rdx), %zmm2
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 256(%rcx), %zmm7
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm8
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm15, %zmm8
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm13, %zmm8
 ; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
 ; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm6, %zmm4, %zmm4
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm4
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm22, %zmm12, %zmm4
-; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm6 = zmm17[1],zmm16[1],zmm17[3],zmm16[3],zmm17[5],zmm16[5],zmm17[7],zmm16[7]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm25, %zmm10, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm6 = zmm9[1],zmm18[1],zmm9[3],zmm18[3],zmm9[5],zmm18[5],zmm9[7],zmm18[7]
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm6 {%k1}
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm4
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm12, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm10, %zmm4
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm8
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm13, %zmm8
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm14, %zmm8
 ; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
 ; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm6, %zmm4
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm4
 ; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm11, %zmm4
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm6
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm3, %zmm6
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm15, %zmm6
 ; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm6
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm16, %zmm11, %zmm6
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm22[0],zmm9[2],zmm22[2],zmm9[4],zmm22[4],zmm9[6],zmm22[6]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm6
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm18, %zmm11, %zmm6
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm10
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm3[0],zmm25[0],zmm3[2],zmm25[2],zmm3[4],zmm25[4],zmm3[6],zmm25[6]
 ; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm6, %zmm4
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm0, %zmm1
 ; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm27, %zmm2
 ; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm16, %zmm0, %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm22[1],zmm9[3],zmm22[3],zmm9[5],zmm22[5],zmm9[7],zmm22[7]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm18, %zmm0, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm3[1],zmm25[1],zmm3[3],zmm25[3],zmm3[5],zmm25[5],zmm3[7],zmm25[7]
 ; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm2, %zmm1
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 320(%rdi), %zmm1
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 320(%rsi), %zmm17
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 320(%rsi), %zmm2
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm4
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm17, %zmm14, %zmm4
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 320(%rdx), %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 320(%rcx), %zmm9
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm5
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm9, %zmm15, %zmm5
-; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %ymm4, %ymm24
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm6
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm9, %zmm13, %zmm6
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm7
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm9, %zmm3, %zmm7
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm5
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm9, %zmm27, %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 384(%rdx), %zmm8
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm2, %zmm12, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 320(%rdx), %zmm7
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 320(%rcx), %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm6
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm13, %zmm6
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm6 = ymm4[0,1,2,3],ymm6[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm11
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm14, %zmm11
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm8
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm15, %zmm8
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm27, %zmm7
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 384(%rdx), %zmm3
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 384(%rcx), %zmm4
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm9
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm15, %zmm9
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm9
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm13, %zmm9
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm3
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm5, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm13, %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm14, %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm15, %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm27, %zmm3
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm27, %zmm8
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 448(%rdx), %zmm3
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 448(%rcx), %zmm4
-; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm4, %zmm3, %zmm15
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm4, %zmm3, %zmm13
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm4, %zmm3, %zmm5
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm4, %zmm3, %zmm14
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm4, %zmm3, %zmm15
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm27, %zmm3
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 320(%r10), %zmm26
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 320(%rax), %zmm16
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm26, %zmm3
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm16, %zmm14, %zmm3
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 320(%r8), %zmm13
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 320(%r9), %zmm15
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm9 = zmm13[0],zmm15[0],zmm13[2],zmm15[2],zmm13[4],zmm15[4],zmm13[6],zmm15[6]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm9 {%k1}
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm24, %zmm9, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 320(%r10), %zmm23
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 320(%rax), %zmm9
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm9, %zmm12, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 320(%r8), %zmm22
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 320(%r9), %zmm16
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm15 = zmm22[0],zmm16[0],zmm22[2],zmm16[2],zmm22[4],zmm16[4],zmm22[6],zmm16[6]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm15 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm6, %zmm15, %zmm3
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm26, %zmm3
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm16, %zmm12, %zmm3
-; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm5 = zmm13[1],zmm15[1],zmm13[3],zmm15[3],zmm13[5],zmm15[5],zmm13[7],zmm15[7]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm5 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm9, %zmm4, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm6 = zmm22[1],zmm16[1],zmm22[3],zmm16[3],zmm22[5],zmm16[5],zmm22[7],zmm16[7]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm6 {%k1}
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm3
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm17, %zmm12, %zmm3
-; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm5, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm2, %zmm4, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm6, %zmm3
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm3
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm17, %zmm11, %zmm3
-; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5,6,7]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm5
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm15, %zmm11, %zmm5
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm26[0],zmm16[0],zmm26[2],zmm16[2],zmm26[4],zmm16[4],zmm26[6],zmm16[6]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm5, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm2, %zmm10, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm22, %zmm6
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm16, %zmm10, %zmm6
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm23[0],zmm9[0],zmm23[2],zmm9[2],zmm23[4],zmm9[4],zmm23[6],zmm9[6]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm6, %zmm3
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm17, %zmm0, %zmm1
-; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm15, %zmm0, %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm26[1],zmm16[1],zmm26[3],zmm16[3],zmm26[5],zmm16[5],zmm26[7],zmm16[7]
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm2, %zmm0, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm22, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm16, %zmm0, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm23[1],zmm9[1],zmm23[3],zmm9[3],zmm23[5],zmm9[5],zmm23[7],zmm9[7]
 ; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm2, %zmm1
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 384(%rdi), %zmm3
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 384(%rsi), %zmm1
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm4
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm14, %zmm4
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm5
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm4
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm12, %zmm5
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm5
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm11, %zmm5
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm6
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm12, %zmm6
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm6
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm4, %zmm6
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm6
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm10, %zmm6
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm0, %zmm3
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 384(%r10), %zmm29
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 384(%r10), %zmm31
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 384(%rax), %zmm3
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm29, %zmm1
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm14, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm12, %zmm1
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm29, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm1
 ; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm4, %zmm1
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 384(%r8), %zmm25
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 384(%r9), %zmm12
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm25, %zmm31
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm12, %zmm11, %zmm31
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 448(%r10), %zmm28
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 448(%rax), %zmm9
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm1
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm9, %zmm14, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 384(%r8), %zmm27
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 384(%r9), %zmm10
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm27, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm10, %zmm5, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 448(%r10), %zmm30
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 448(%rax), %zmm11
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm30, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm11, %zmm12, %zmm1
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 448(%rdi), %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 448(%rdi), %zmm6
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 448(%rsi), %zmm1
-; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm1, %zmm5, %zmm14
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm9, %zmm4, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm1, %zmm6, %zmm12
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm30, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm11, %zmm4, %zmm2
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm1, %zmm5, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm1, %zmm6, %zmm4
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 448(%r8), %zmm17
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 448(%r9), %zmm8
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm30
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm8, %zmm11, %zmm30
-; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm1, %zmm5, %zmm11
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm0, %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 448(%r8), %zmm19
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 448(%r9), %zmm12
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm19, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm12, %zmm5, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm1, %zmm6, %zmm5
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm25, %zmm24
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm12, %zmm0, %zmm24
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm27
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm8, %zmm0, %zmm27
-; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm5 = [0,8,0,8,0,8,0,8]
-; AVX512F-ONLY-SLOW-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm19, %zmm5, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm6 = [1,9,1,9,1,9,1,9]
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm0, %zmm6
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm27, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm10, %zmm0, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm19, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm12, %zmm0, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm6 = [0,8,0,8,0,8,0,8]
 ; AVX512F-ONLY-SLOW-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm19, %zmm6, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm26, %zmm6, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm14 = [1,9,1,9,1,9,1,9]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm26, %zmm14, %zmm0
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10]
 ; AVX512F-ONLY-SLOW-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm19, %zmm2, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm26, %zmm2, %zmm0
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [3,11,3,11,3,11,3,11]
 ; AVX512F-ONLY-SLOW-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm19, %zmm1, %zmm21
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm26, %zmm1, %zmm20
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm7
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm8
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm5, %zmm7
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm7
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm6, %zmm7
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm21
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm2, %zmm21
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm6, %zmm8
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm8
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm14, %zmm8
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm29
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm2, %zmm29
 ; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm1, %zmm0
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm21, %zmm6, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm21, %zmm14, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm21, %zmm2, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm21, %zmm1, %zmm28
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm5, %zmm4
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm4
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm6, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm26, %zmm6, %zmm4
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm4
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm2, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm26, %zmm14, %zmm4
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm1, %zmm23
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm28
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm26, %zmm2, %zmm28
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm26, %zmm1, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm7
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm5, %zmm7
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm7
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm6, %zmm7
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm19
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm2, %zmm19
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm1, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm4
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm18, %zmm5, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm24, %zmm6, %zmm4
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm4
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm18, %zmm6, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm24, %zmm14, %zmm4
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm4
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm18, %zmm2, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm24, %zmm2, %zmm4
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm18, %zmm1, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm24, %zmm1, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm4
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm7
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm5, %zmm7
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm7
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm6, %zmm7
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm6, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm14, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm21
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm2, %zmm21
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm1, %zmm7
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm14
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm2, %zmm14
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm1, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm4
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm20, %zmm5, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm17, %zmm6, %zmm4
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm4
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm20, %zmm6, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm17, %zmm14, %zmm4
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm4
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm20, %zmm2, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm17, %zmm2, %zmm4
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm20, %zmm1, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm17, %zmm1, %zmm0
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm4
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm10, %zmm5, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm6, %zmm4
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm4
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm10, %zmm6, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm14, %zmm4
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm11
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm10, %zmm2, %zmm11
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm10, %zmm1, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm15
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm2, %zmm15
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm1, %zmm0
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm4
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm22, %zmm5, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm25, %zmm6, %zmm4
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm4
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm22, %zmm6, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm25, %zmm14, %zmm4
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm4
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm22, %zmm2, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm25, %zmm2, %zmm4
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm22, %zmm1, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm25, %zmm1, %zmm0
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm7
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm5, %zmm7
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm7
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm6, %zmm7
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm10
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm2, %zmm10
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm1, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm26, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm16, %zmm5, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm18, %zmm6, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm18, %zmm14, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm13
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm18, %zmm2, %zmm13
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm18, %zmm1, %zmm0
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm26, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm16, %zmm6, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm9, %zmm6, %zmm0
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm26, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm16, %zmm2, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm9, %zmm14, %zmm0
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm16, %zmm1, %zmm26
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm22
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm15, %zmm5, %zmm22
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm23
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm15, %zmm6, %zmm23
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm7
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm15, %zmm2, %zmm7
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm15, %zmm1, %zmm13
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm29, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm5, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm9, %zmm2, %zmm0
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm29, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm9, %zmm1, %zmm23
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm22, %zmm24
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm16, %zmm6, %zmm24
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm22, %zmm25
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm16, %zmm14, %zmm25
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm22, %zmm8
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm16, %zmm2, %zmm8
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm16, %zmm1, %zmm22
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm0
 ; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm6, %zmm0
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm31 {%k1} = zmm29[0],zmm3[0],zmm29[2],zmm3[2],zmm29[4],zmm3[4],zmm29[6],zmm3[6]
-; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm24 {%k1} = zmm29[1],zmm3[1],zmm29[3],zmm3[3],zmm29[5],zmm3[5],zmm29[7],zmm3[7]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm29, %zmm20
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm2, %zmm20
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm1, %zmm29
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm25, %zmm15
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm12, %zmm5, %zmm15
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm25, %zmm16
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm12, %zmm6, %zmm16
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm25, %zmm18
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm12, %zmm2, %zmm18
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm25[0],zmm12[0],zmm25[2],zmm12[2],zmm25[4],zmm12[4],zmm25[6],zmm12[6]
-; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm4 = zmm25[1],zmm12[1],zmm25[3],zmm12[3],zmm25[5],zmm12[5],zmm25[7],zmm12[7]
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm12, %zmm1, %zmm25
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm9, %zmm5, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm14, %zmm0
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm9, %zmm6, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm31[0],zmm3[0],zmm31[2],zmm3[2],zmm31[4],zmm3[4],zmm31[6],zmm3[6]
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm31[1],zmm3[1],zmm31[3],zmm3[3],zmm31[5],zmm3[5],zmm31[7],zmm3[7]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm26
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm2, %zmm26
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm1, %zmm31
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm27, %zmm17
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm10, %zmm6, %zmm17
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm27, %zmm20
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm10, %zmm14, %zmm20
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm27, %zmm18
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm10, %zmm2, %zmm18
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm27[0],zmm10[0],zmm27[2],zmm10[2],zmm27[4],zmm10[4],zmm27[6],zmm10[6]
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm4 = zmm27[1],zmm10[1],zmm27[3],zmm10[3],zmm27[5],zmm10[5],zmm27[7],zmm10[7]
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm10, %zmm1, %zmm27
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm30, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm11, %zmm6, %zmm0
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm30 {%k1} = zmm28[0],zmm9[0],zmm28[2],zmm9[2],zmm28[4],zmm9[4],zmm28[6],zmm9[6]
-; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm28[1],zmm9[1],zmm28[3],zmm9[3],zmm28[5],zmm9[5],zmm28[7],zmm9[7]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm12
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm9, %zmm2, %zmm12
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm9, %zmm1, %zmm28
-; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm8, %zmm17, %zmm5
-; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm8, %zmm17, %zmm6
-; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm8, %zmm17, %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm9 = zmm17[0],zmm8[0],zmm17[2],zmm8[2],zmm17[4],zmm8[4],zmm17[6],zmm8[6]
-; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm0 = zmm17[1],zmm8[1],zmm17[3],zmm8[3],zmm17[5],zmm8[5],zmm17[7],zmm8[7]
-; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm8, %zmm1, %zmm17
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm30, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm11, %zmm14, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm10 {%k1} = zmm30[0],zmm11[0],zmm30[2],zmm11[2],zmm30[4],zmm11[4],zmm30[6],zmm11[6]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm30[1],zmm11[1],zmm30[3],zmm11[3],zmm30[5],zmm11[5],zmm30[7],zmm11[7]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm30, %zmm16
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm11, %zmm2, %zmm16
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm11, %zmm1, %zmm30
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm12, %zmm19, %zmm6
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm12, %zmm19, %zmm14
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm12, %zmm19, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm11 = zmm19[0],zmm12[0],zmm19[2],zmm12[2],zmm19[4],zmm12[4],zmm19[6],zmm12[6]
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm0 = zmm19[1],zmm12[1],zmm19[3],zmm12[3],zmm19[5],zmm12[5],zmm19[7],zmm12[7]
+; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm12, %zmm1, %zmm19
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
 ; AVX512F-ONLY-SLOW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
 ; AVX512F-ONLY-SLOW-NEXT:    # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm3 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm3 {%k1}
 ; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm3, %zmm1
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
@@ -13857,19 +12436,20 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
 ; AVX512F-ONLY-SLOW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
 ; AVX512F-ONLY-SLOW-NEXT:    # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm31, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm7, %zmm1
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
 ; AVX512F-ONLY-SLOW-NEXT:    # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm24, %zmm1
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vinsertf64x4 $0, %ymm1, %zmm3, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
 ; AVX512F-ONLY-SLOW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
 ; AVX512F-ONLY-SLOW-NEXT:    # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm9 {%k1}
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm9, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm11 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm11, %zmm1
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
 ; AVX512F-ONLY-SLOW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
@@ -13881,338 +12461,308 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512F-ONLY-SLOW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
 ; AVX512F-ONLY-SLOW-NEXT:    # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm30, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm10, %zmm0
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512F-ONLY-SLOW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
 ; AVX512F-ONLY-SLOW-NEXT:    # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm27, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm5, %zmm0
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm21 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm29 {%k1}
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa (%rcx), %ymm0
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa (%rdx), %ymm1
 ; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa (%rsi), %ymm4
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa (%rdi), %ymm8
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm9 = ymm8[0],ymm4[0],ymm8[2],ymm4[2]
-; AVX512F-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm9[2,3],ymm3[2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm21, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa (%rdi), %ymm9
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm11 = ymm9[0],ymm4[0],ymm9[2],ymm4[2]
+; AVX512F-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm11[2,3],ymm3[2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm29, %zmm3
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm9 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm5 {%k1}
 ; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
-; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm4[1],ymm8[3],ymm4[3]
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm4[1],ymm9[3],ymm4[3]
 ; AVX512F-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm9, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm5, %zmm0
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm19 {%k1}
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 64(%rsi), %ymm0
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 64(%rdi), %ymm1
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 64(%rcx), %ymm3
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 64(%rdx), %ymm4
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm8 = ymm4[0],ymm3[0],ymm4[2],ymm3[2]
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm9 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
-; AVX512F-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm8 = ymm9[2,3],ymm8[2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm8, %zmm19, %zmm8
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 (%rsp), %zmm9 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm9 {%k1}
-; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm4[1],ymm3[1],ymm4[3],ymm3[3]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm28 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 64(%rcx), %ymm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 64(%rdx), %ymm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 64(%rsi), %ymm4
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 64(%rdi), %ymm9
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm11 = ymm9[0],ymm4[0],ymm9[2],ymm4[2]
+; AVX512F-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm11[2,3],ymm3[2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm28, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm5 {%k1}
 ; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
-; AVX512F-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm9, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm4[1],ymm9[3],ymm4[3]
+; AVX512F-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm5, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm14 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm21 {%k1}
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 128(%rcx), %ymm0
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 128(%rdx), %ymm1
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 128(%rsi), %ymm8
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 128(%rdi), %ymm9
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm4 = ymm9[0],ymm8[0],ymm9[2],ymm8[2]
-; AVX512F-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm14, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm4 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 128(%rsi), %ymm9
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 128(%rdi), %ymm11
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm11[0],ymm9[0],ymm11[2],ymm9[2]
+; AVX512F-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm4[2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm21, %zmm3
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 (%rsp), %zmm3 # 64-byte Reload
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
 ; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
-; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm8[1],ymm9[3],ymm8[3]
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm11[1],ymm9[1],ymm11[3],ymm9[3]
 ; AVX512F-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
 ; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm0
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm11 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm15 {%k1}
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 192(%rcx), %ymm0
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 192(%rdx), %ymm1
 ; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 192(%rsi), %ymm4
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 192(%rdi), %ymm8
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm14 = ymm8[0],ymm4[0],ymm8[2],ymm4[2]
-; AVX512F-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm14[2,3],ymm3[2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm11, %zmm24
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 192(%rdi), %ymm9
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm11 = ymm9[0],ymm4[0],ymm9[2],ymm4[2]
+; AVX512F-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm11[2,3],ymm3[2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm15, %zmm29
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm9 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm5 {%k1}
 ; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
-; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm4[1],ymm8[3],ymm4[3]
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm4[1],ymm9[3],ymm4[3]
 ; AVX512F-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm9, %zmm21
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm5, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm10 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm13 {%k1}
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 256(%rcx), %ymm0
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 256(%rdx), %ymm1
 ; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 256(%rsi), %ymm4
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 256(%rdi), %ymm8
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm14 = ymm8[0],ymm4[0],ymm8[2],ymm4[2]
-; AVX512F-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm14[2,3],ymm3[2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm10, %zmm19
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm9 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 256(%rdi), %ymm11
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm15 = ymm11[0],ymm4[0],ymm11[2],ymm4[2]
+; AVX512F-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm15[2,3],ymm3[2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm13, %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm7 {%k1}
 ; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
-; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm4[1],ymm8[3],ymm4[3]
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm11[1],ymm4[1],ymm11[3],ymm4[3]
 ; AVX512F-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm9, %zmm10
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm7 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm7, %zmm0
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm8 {%k1}
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 320(%rcx), %ymm1
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 320(%rdx), %ymm3
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm3[0],ymm1[0],ymm3[2],ymm1[2]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 320(%rsi), %ymm4
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 320(%rdi), %ymm14
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm9 = ymm14[0],ymm4[0],ymm14[2],ymm4[2]
-; AVX512F-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm9[2,3],ymm0[2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm7, %zmm8
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm26, %zmm13 {%k1}
-; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3]
-; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm14[1],ymm4[1],ymm14[3],ymm4[3]
-; AVX512F-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm13, %zmm13
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm18 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 320(%rdx), %ymm4
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm11 = ymm4[0],ymm1[0],ymm4[2],ymm1[2]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 320(%rsi), %ymm12
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 320(%rdi), %ymm15
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm9 = ymm15[0],ymm12[0],ymm15[2],ymm12[2]
+; AVX512F-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm9 = ymm9[2,3],ymm11[2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm9, %zmm8, %zmm8
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm22 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm4[1],ymm1[1],ymm4[3],ymm1[3]
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm4 = ymm15[1],ymm12[1],ymm15[3],ymm12[3]
+; AVX512F-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm22, %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm26, %zmm18 {%k1}
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 384(%rcx), %ymm1
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 384(%rdx), %ymm3
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm4 = ymm3[0],ymm1[0],ymm3[2],ymm1[2]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 384(%rsi), %ymm7
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 384(%rdi), %ymm9
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm14 = ymm9[0],ymm7[0],ymm9[2],ymm7[2]
-; AVX512F-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm4 = ymm14[2,3],ymm4[2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm18, %zmm18
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm29, %zmm25 {%k1}
-; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3]
-; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm9[1],ymm7[1],ymm9[3],ymm7[3]
-; AVX512F-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm25, %zmm25
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm2 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 384(%rdx), %ymm9
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm10 = ymm9[0],ymm1[0],ymm9[2],ymm1[2]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 384(%rsi), %ymm11
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 384(%rdi), %ymm12
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm11[0],ymm12[2],ymm11[2]
+; AVX512F-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm10 = ymm13[2,3],ymm10[2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm10, %zmm18, %zmm10
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm27 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm1[1],ymm9[3],ymm1[3]
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm9 = ymm12[1],ymm11[1],ymm12[3],ymm11[3]
+; AVX512F-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm9[2,3],ymm1[2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm27, %zmm9
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm2 {%k1}
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 448(%rcx), %ymm1
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 448(%rdx), %ymm3
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm4 = ymm3[0],ymm1[0],ymm3[2],ymm1[2]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 448(%rsi), %ymm7
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 448(%rdi), %ymm9
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm12 = ymm9[0],ymm7[0],ymm9[2],ymm7[2]
-; AVX512F-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm4 = ymm12[2,3],ymm4[2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm2, %zmm4
-; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3]
-; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm9[1],ymm7[1],ymm9[3],ymm7[3]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 448(%rdx), %ymm7
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm11 = ymm7[0],ymm1[0],ymm7[2],ymm1[2]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 448(%rsi), %ymm12
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 448(%rdi), %ymm13
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm15 = ymm13[0],ymm12[0],ymm13[2],ymm12[2]
+; AVX512F-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm11 = ymm15[2,3],ymm11[2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm11, %zmm2, %zmm11
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm1[1],ymm7[3],ymm1[3]
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm13[1],ymm12[1],ymm13[3],ymm12[3]
 ; AVX512F-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm17 {%k1}
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm17, %zmm9
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm11 {%k1}
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa (%rcx), %xmm2
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa (%rdx), %xmm3
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm2[0]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa (%rsi), %xmm7
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa (%rdi), %xmm12
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm14 = xmm12[0],xmm7[0]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm14, %ymm1
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm11, %zmm11
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm3[1],xmm2[1]
-; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm3 = xmm12[1],xmm7[1]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm2, %ymm3, %ymm2
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm1, %zmm7
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 64(%rcx), %xmm2
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 64(%rdx), %xmm12
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm12[0],xmm2[0]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 64(%rsi), %xmm14
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 64(%rdi), %xmm17
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm20 = xmm17[0],xmm14[0]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti32x4 $1, %xmm3, %ymm20, %ymm3
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm1, %zmm3
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm12[1],xmm2[1]
-; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm12 = xmm17[1],xmm14[1]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm2, %ymm12, %ymm2
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm1, %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 128(%rcx), %xmm12
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 128(%rdx), %xmm14
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm17 = xmm14[0],xmm12[0]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 128(%rsi), %xmm20
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 128(%rdi), %xmm26
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm28 = xmm26[0],xmm20[0]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti32x4 $1, %xmm17, %ymm28, %ymm17
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm17, %zmm1, %zmm17
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm12 = xmm14[1],xmm12[1]
-; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm14 = xmm26[1],xmm20[1]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm12, %ymm14, %ymm12
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm12, %zmm1, %zmm12
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 192(%rcx), %xmm14
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 192(%rdx), %xmm20
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm26 = xmm20[0],xmm14[0]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 192(%rsi), %xmm28
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 192(%rdi), %xmm29
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm30 = xmm29[0],xmm28[0]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti32x4 $1, %xmm26, %ymm30, %ymm26
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm26, %zmm1, %zmm26
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm30, %zmm19 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm19, %zmm7
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm14 = xmm20[1],xmm14[1]
-; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm20 = xmm29[1],xmm28[1]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti32x4 $1, %xmm14, %ymm20, %ymm14
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm14, %zmm1, %zmm14
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 256(%rcx), %xmm20
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 256(%rdx), %xmm28
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm29 = xmm28[0],xmm20[0]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 256(%rsi), %xmm30
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 256(%rdi), %xmm0
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm31 = xmm0[0],xmm30[0]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti32x4 $1, %xmm29, %ymm31, %ymm29
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm29, %zmm1, %zmm29
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm27 {%k1}
-; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm20 = xmm28[1],xmm20[1]
-; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm30[1]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti32x4 $1, %xmm20, %ymm0, %ymm0
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm27, %zmm1
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm22 {%k1}
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 320(%rcx), %xmm20
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 320(%rdx), %xmm28
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm30 = xmm28[0],xmm20[0]
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 320(%rsi), %xmm31
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 320(%rdi), %xmm0
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm27 = xmm0[0],xmm31[0]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti32x4 $1, %xmm30, %ymm27, %ymm27
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm27, %zmm22, %zmm22
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm13 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa (%rsi), %xmm1
+; AVX512F-ONLY-SLOW-NEXT:    vinserti128 $1, (%rcx), %ymm1, %ymm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa (%rdi), %xmm1
+; AVX512F-ONLY-SLOW-NEXT:    vinserti128 $1, (%rdx), %ymm1, %ymm12
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm12[0],ymm2[0],ymm12[2],ymm2[2]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm13, %zmm1
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm15 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm12[1],ymm2[1],ymm12[3],ymm2[3]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm15, %zmm2
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm18 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 64(%rsi), %xmm12
+; AVX512F-ONLY-SLOW-NEXT:    vinserti128 $1, 64(%rcx), %ymm12, %ymm12
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 64(%rdi), %xmm13
+; AVX512F-ONLY-SLOW-NEXT:    vinserti128 $1, 64(%rdx), %ymm13, %ymm13
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm15 = ymm13[0],ymm12[0],ymm13[2],ymm12[2]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm15, %zmm18, %zmm15
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm19 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm12 = ymm13[1],ymm12[1],ymm13[3],ymm12[3]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm12, %zmm19, %zmm12
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm23 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa 128(%rsi), %xmm13
+; AVX512F-ONLY-SLOW-NEXT:    vinserti128 $1, 128(%rcx), %ymm13, %ymm13
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 128(%rdi), %xmm18
+; AVX512F-ONLY-SLOW-NEXT:    vinserti32x4 $1, 128(%rdx), %ymm18, %ymm18
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm19 = ymm18[0],ymm13[0],ymm18[2],ymm13[2]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm19, %zmm23, %zmm19
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm27, %zmm23 {%k1}
-; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm20 = xmm28[1],xmm20[1]
-; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm31[1]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti32x4 $1, %xmm20, %ymm0, %ymm0
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm23, %zmm0
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm15 {%k1}
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 384(%rcx), %xmm20
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 384(%rdx), %xmm23
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm27 = xmm23[0],xmm20[0]
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm27 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm13 = ymm18[1],ymm13[1],ymm18[3],ymm13[3]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm13, %zmm27, %zmm13
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm5 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 192(%rsi), %xmm18
+; AVX512F-ONLY-SLOW-NEXT:    vinserti32x4 $1, 192(%rcx), %ymm18, %ymm18
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 192(%rdi), %xmm23
+; AVX512F-ONLY-SLOW-NEXT:    vinserti32x4 $1, 192(%rdx), %ymm23, %ymm23
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm27 = ymm23[0],ymm18[0],ymm23[2],ymm18[2]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm27, %zmm5, %zmm27
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm30, %zmm31 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm18 = ymm23[1],ymm18[1],ymm23[3],ymm18[3]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm18, %zmm31, %zmm18
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm16 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 256(%rsi), %xmm23
+; AVX512F-ONLY-SLOW-NEXT:    vinserti32x4 $1, 256(%rcx), %ymm23, %ymm23
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 256(%rdi), %xmm30
+; AVX512F-ONLY-SLOW-NEXT:    vinserti32x4 $1, 256(%rdx), %ymm30, %ymm30
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm31 = ymm30[0],ymm23[0],ymm30[2],ymm23[2]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm31, %zmm16, %zmm26
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm16 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm23 = ymm30[1],ymm23[1],ymm30[3],ymm23[3]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm23, %zmm16, %zmm23
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm24 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 320(%rsi), %xmm28
+; AVX512F-ONLY-SLOW-NEXT:    vinserti32x4 $1, 320(%rcx), %ymm28, %ymm28
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 320(%rdi), %xmm30
+; AVX512F-ONLY-SLOW-NEXT:    vinserti32x4 $1, 320(%rdx), %ymm30, %ymm30
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm31 = ymm30[0],ymm28[0],ymm30[2],ymm28[2]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm31, %zmm24, %zmm21
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm25 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm28 = ymm30[1],ymm28[1],ymm30[3],ymm28[3]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm28, %zmm25, %zmm22
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm17 {%k1}
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 384(%rsi), %xmm28
+; AVX512F-ONLY-SLOW-NEXT:    vinserti32x4 $1, 384(%rcx), %ymm28, %ymm28
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 384(%rdi), %xmm30
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm31 = xmm30[0],xmm28[0]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti32x4 $1, %xmm27, %ymm31, %ymm27
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm27, %zmm15, %zmm15
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm27, %zmm16 {%k1}
-; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm20 = xmm23[1],xmm20[1]
-; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm23 = xmm30[1],xmm28[1]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti32x4 $1, %xmm20, %ymm23, %ymm20
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm20, %zmm16, %zmm16
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm5 {%k1}
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 448(%rcx), %xmm20
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 448(%rdx), %xmm23
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm27 = xmm23[0],xmm20[0]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti32x4 $1, 384(%rdx), %ymm30, %ymm30
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm31 = ymm30[0],ymm28[0],ymm30[2],ymm28[2]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm31, %zmm17, %zmm17
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm20 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm28 = ymm30[1],ymm28[1],ymm30[3],ymm28[3]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm28, %zmm20, %zmm16
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm6 {%k1}
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 448(%rsi), %xmm28
+; AVX512F-ONLY-SLOW-NEXT:    vinserti32x4 $1, 448(%rcx), %ymm28, %ymm28
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 448(%rdi), %xmm30
-; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm31 = xmm30[0],xmm28[0]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti32x4 $1, %xmm27, %ymm31, %ymm27
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm27, %zmm5, %zmm5
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm27, %zmm6 {%k1}
-; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm20 = xmm23[1],xmm20[1]
-; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm23 = xmm30[1],xmm28[1]
-; AVX512F-ONLY-SLOW-NEXT:    vinserti32x4 $1, %xmm20, %ymm23, %ymm20
-; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm20, %zmm6, %zmm6
+; AVX512F-ONLY-SLOW-NEXT:    vinserti32x4 $1, 448(%rdx), %ymm30, %ymm30
+; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm31 = ymm30[0],ymm28[0],ymm30[2],ymm28[2]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm31, %zmm6, %zmm6
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm14 {%k1}
+; AVX512F-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm28 = ymm30[1],ymm28[1],ymm30[3],ymm28[3]
+; AVX512F-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm28, %zmm14, %zmm5
 ; AVX512F-ONLY-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, 3776(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, 3712(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm25, 3264(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm18, 3200(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, 2752(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, 3776(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm11, 3712(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, 3264(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm10, 3200(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, 2752(%rax)
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, 2688(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm10, 2240(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm19, 2176(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm21, 1728(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm24, 1664(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm4, 1216(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm4, 1152(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovups (%rsp), %zmm4 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm4, 704(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm4, 640(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm4, 192(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm4, 128(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm4, 4032(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm4, 3968(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm4, 3904(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm4, 3840(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, 3648(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, 3584(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm4, 3520(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm4, 3456(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm4, 3392(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm4, 3328(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, 2240(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, 2176(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups (%rsp), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 1728(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm29, 1664(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 1216(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 1152(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 704(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 640(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 192(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 128(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 4032(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 3968(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 3904(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 3840(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, 3648(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, 3584(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 3520(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 3456(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 3392(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 3328(%rax)
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm16, 3136(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm15, 3072(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm4, 3008(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm4, 2944(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm4, 2880(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm4, 2816(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, 2624(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm22, 2560(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm17, 3072(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 3008(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 2944(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 2880(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 2816(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm22, 2624(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm21, 2560(%rax)
 ; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 2496(%rax)
 ; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
@@ -14221,8 +12771,8 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 2368(%rax)
 ; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 2304(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, 2112(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm29, 2048(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm23, 2112(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm26, 2048(%rax)
 ; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 1984(%rax)
 ; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
@@ -14231,8 +12781,8 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 1856(%rax)
 ; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 1792(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm14, 1600(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm26, 1536(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm18, 1600(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm27, 1536(%rax)
 ; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 1472(%rax)
 ; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
@@ -14241,8 +12791,8 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 1344(%rax)
 ; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 1280(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm12, 1088(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm17, 1024(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, 1088(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm19, 1024(%rax)
 ; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 960(%rax)
 ; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
@@ -14251,8 +12801,8 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 832(%rax)
 ; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 768(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, 576(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, 512(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm12, 576(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm15, 512(%rax)
 ; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 448(%rax)
 ; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
@@ -14261,608 +12811,621 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 320(%rax)
 ; AVX512F-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512F-ONLY-SLOW-NEXT:    vmovaps %zmm0, 256(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, 64(%rax)
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm11, (%rax)
-; AVX512F-ONLY-SLOW-NEXT:    addq $5384, %rsp # imm = 0x1508
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, 64(%rax)
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, (%rax)
+; AVX512F-ONLY-SLOW-NEXT:    addq $5576, %rsp # imm = 0x15C8
 ; AVX512F-ONLY-SLOW-NEXT:    vzeroupper
 ; AVX512F-ONLY-SLOW-NEXT:    retq
 ;
 ; AVX512F-ONLY-FAST-LABEL: store_i64_stride8_vf64:
 ; AVX512F-ONLY-FAST:       # %bb.0:
-; AVX512F-ONLY-FAST-NEXT:    subq $5384, %rsp # imm = 0x1508
+; AVX512F-ONLY-FAST-NEXT:    subq $5576, %rsp # imm = 0x15C8
 ; AVX512F-ONLY-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512F-ONLY-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 128(%rdi), %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovaps 128(%rdi), %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%rdi), %zmm4
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%rdi), %zmm8
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 128(%rsi), %zmm16
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%rsi), %zmm17
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 128(%rsi), %zmm23
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%rsi), %zmm25
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%rsi), %zmm3
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%rdx), %zmm6
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%rdx), %zmm9
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%rcx), %zmm7
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%rcx), %zmm11
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%r8), %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%r8), %zmm24
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 128(%r8), %zmm20
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%r9), %zmm25
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 128(%r9), %zmm22
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%r9), %zmm18
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%r10), %zmm23
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%r10), %zmm21
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%rax), %zmm26
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%rax), %zmm19
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%rcx), %zmm10
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%r8), %zmm14
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%r8), %zmm19
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 128(%r8), %zmm16
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%r9), %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%r9), %zmm21
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 128(%r9), %zmm18
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%r10), %zmm20
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%r10), %zmm28
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 (%rax), %zmm26
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%rax), %zmm24
 ; AVX512F-ONLY-FAST-NEXT:    movb $-64, %r11b
 ; AVX512F-ONLY-FAST-NEXT:    kmovw %r11d, %k1
-; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm14 = [4,12,4,12,4,12,4,12]
-; AVX512F-ONLY-FAST-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm21, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm19, %zmm14, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm10 = zmm2[0],zmm18[0],zmm2[2],zmm18[2],zmm2[4],zmm18[4],zmm2[6],zmm18[6]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm10 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [4,12,4,12,4,12,4,12]
+; AVX512F-ONLY-FAST-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm20, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm26, %zmm2, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm11 = zmm14[0],zmm1[0],zmm14[2],zmm1[2],zmm14[4],zmm1[4],zmm14[6],zmm1[6]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm11 {%k1}
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm14, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm15 = [4,12,4,12]
-; AVX512F-ONLY-FAST-NEXT:    # ymm15 = mem[0,1,0,1]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, %zmm13
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm11, %zmm15, %zmm13
-; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7]
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm10, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm2, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm13 = [4,12,4,12]
+; AVX512F-ONLY-FAST-NEXT:    # ymm13 = mem[0,1,0,1]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, %zmm12
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm10, %zmm13, %zmm12
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm11, %zmm0
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,5,13,5,13,5,13]
-; AVX512F-ONLY-FAST-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm21, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm19, %zmm12, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm10 = zmm2[1],zmm18[1],zmm2[3],zmm18[3],zmm2[5],zmm18[5],zmm2[7],zmm18[7]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm5
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm10 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm5 = [5,13,5,13,5,13,5,13]
+; AVX512F-ONLY-FAST-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm20, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm26, %zmm5, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm11 = zmm14[1],zmm1[1],zmm14[3],zmm1[3],zmm14[5],zmm1[5],zmm14[7],zmm1[7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm14, %zmm15
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm11 {%k1}
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm12, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [5,13,5,13]
-; AVX512F-ONLY-FAST-NEXT:    # ymm2 = mem[0,1,0,1]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, %zmm13
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm11, %zmm2, %zmm13
-; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7]
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm10, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm5, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm14 = [5,13,5,13]
+; AVX512F-ONLY-FAST-NEXT:    # ymm14 = mem[0,1,0,1]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, %zmm12
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm10, %zmm14, %zmm12
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm11, %zmm0
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,6,14,6,14,6,14]
-; AVX512F-ONLY-FAST-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, %zmm27
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm18, %zmm13, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm21[0],zmm19[0],zmm21[2],zmm19[2],zmm21[4],zmm19[4],zmm21[6],zmm19[6]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, %zmm10
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm13, %zmm10
-; AVX512F-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [6,14,6,14]
-; AVX512F-ONLY-FAST-NEXT:    # ymm5 = mem[0,1,0,1]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, %zmm13
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm11, %zmm5, %zmm13
-; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm13[4,5,6,7]
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm10, %zmm0, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm12 = [6,14,6,14,6,14,6,14]
+; AVX512F-ONLY-FAST-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm15, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm15, %zmm22
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm17
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm12, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm20[0],zmm26[0],zmm20[2],zmm26[2],zmm20[4],zmm26[4],zmm20[6],zmm26[6]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, %zmm11
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm12, %zmm11
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm12, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm15 = [6,14,6,14]
+; AVX512F-ONLY-FAST-NEXT:    # ymm15 = mem[0,1,0,1]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, %zmm12
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm10, %zmm15, %zmm12
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm11, %zmm0, %zmm0
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,7,15,7,15,7,15]
 ; AVX512F-ONLY-FAST-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm27, %zmm10
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm18, %zmm0, %zmm10
-; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm10 {%k1} = zmm21[1],zmm19[1],zmm21[3],zmm19[3],zmm21[5],zmm19[5],zmm21[7],zmm19[7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm22, %zmm11
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm17, %zmm0, %zmm11
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm11 {%k1} = zmm20[1],zmm26[1],zmm20[3],zmm26[3],zmm20[5],zmm26[5],zmm20[7],zmm26[7]
 ; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm0, %zmm8
 ; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} ymm27 = [7,15,7,15]
 ; AVX512F-ONLY-FAST-NEXT:    # ymm27 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm11, %zmm27, %zmm9
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm10, %zmm27, %zmm9
 ; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm8, %zmm10, %zmm8
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm8, %zmm11, %zmm8
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm23, %zmm8
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm26, %zmm14, %zmm8
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm9 = zmm24[0],zmm25[0],zmm24[2],zmm25[2],zmm24[4],zmm25[4],zmm24[6],zmm25[6]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm28, %zmm8
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm12
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm24, %zmm2, %zmm8
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm9 = zmm19[0],zmm21[0],zmm19[2],zmm21[2],zmm19[4],zmm21[4],zmm19[6],zmm21[6]
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, %zmm9 {%k1}
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm8
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm17, %zmm14, %zmm8
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm25, %zmm2, %zmm8
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm6, %zmm10
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm15, %zmm10
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm13, %zmm10
 ; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7]
 ; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm8, %zmm9, %zmm8
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm23, %zmm8
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm26, %zmm12, %zmm8
-; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm9 = zmm24[1],zmm25[1],zmm24[3],zmm25[3],zmm24[5],zmm25[5],zmm24[7],zmm25[7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm28, %zmm8
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm24, %zmm5, %zmm8
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm9 = zmm19[1],zmm21[1],zmm19[3],zmm21[3],zmm19[5],zmm21[5],zmm19[7],zmm21[7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, %zmm9 {%k1}
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm8
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm17, %zmm12, %zmm8
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm25, %zmm5, %zmm8
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm6, %zmm10
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm2, %zmm10
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm14, %zmm10
 ; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7]
 ; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm8, %zmm9, %zmm8
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm8
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm17, %zmm11, %zmm8
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm25, %zmm1, %zmm8
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm6, %zmm9
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, %zmm10
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm5, %zmm9
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm15, %zmm9
 ; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm24, %zmm5
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm24, (%rsp) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm24, %zmm9
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm25, %zmm11, %zmm9
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm23[0],zmm26[0],zmm23[2],zmm26[2],zmm23[4],zmm26[4],zmm23[6],zmm26[6]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm19, %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm19, %zmm9
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm21, %zmm1, %zmm9
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm24, %zmm10
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm28[0],zmm24[0],zmm28[2],zmm24[2],zmm28[4],zmm24[4],zmm28[6],zmm24[6]
 ; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm8, %zmm9, %zmm8
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 128(%r10), %zmm9
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm17, %zmm0, %zmm4
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 128(%rax), %zmm18
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 128(%r10), %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm25, %zmm0, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 128(%rax), %zmm24
 ; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm27, %zmm6
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm25, %zmm0, %zmm5
-; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm23[1],zmm26[1],zmm23[3],zmm26[3],zmm23[5],zmm26[5],zmm23[7],zmm26[7]
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm21, %zmm0, %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm28[1],zmm10[1],zmm28[3],zmm10[3],zmm28[5],zmm10[5],zmm28[7],zmm10[7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm10, %zmm21
 ; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7]
 ; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm5, %zmm4
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, %zmm4
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm18, %zmm14, %zmm4
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm5 = zmm20[0],zmm22[0],zmm20[2],zmm22[2],zmm20[4],zmm22[4],zmm20[6],zmm22[6]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm24, %zmm12, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm5 = zmm16[0],zmm18[0],zmm16[2],zmm18[2],zmm16[4],zmm18[4],zmm16[6],zmm18[6]
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm5 {%k1}
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm6
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm16, %zmm14, %zmm6
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, %zmm6
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm23, %zmm12, %zmm6
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 128(%rdx), %zmm4
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 128(%rcx), %zmm7
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm8
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm15, %zmm8
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm13, %zmm8
 ; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
 ; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm6, %zmm5, %zmm5
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, %zmm5
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm18, %zmm12, %zmm5
-; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm6 = zmm20[1],zmm22[1],zmm20[3],zmm22[3],zmm20[5],zmm22[5],zmm20[7],zmm22[7]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm24, %zmm3, %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm6 = zmm16[1],zmm18[1],zmm16[3],zmm18[3],zmm16[5],zmm18[5],zmm16[7],zmm18[7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, %zmm6 {%k1}
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm5
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm16, %zmm12, %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm23, %zmm3, %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm10
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm8
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm2, %zmm8
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm13
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm14, %zmm8
 ; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7]
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm5, %zmm6, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm5
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm16, %zmm11, %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm5, %zmm6, %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm11
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm23, %zmm1, %zmm5
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm6
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm10, %zmm6
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm10, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm15, %zmm6
 ; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm20, %zmm6
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm22, %zmm11, %zmm6
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm18[0],zmm9[2],zmm18[2],zmm9[4],zmm18[4],zmm9[6],zmm18[6]
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm5, %zmm6, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm16, %zmm0, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm16, %zmm6
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm18, %zmm1, %zmm6
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, (%rsp) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm2[0],zmm24[0],zmm2[2],zmm24[2],zmm2[4],zmm24[4],zmm2[6],zmm24[6]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm5, %zmm6, %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm23, %zmm0, %zmm9
 ; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm27, %zmm4
-; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm20, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm22, %zmm0, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm18[1],zmm9[3],zmm18[3],zmm9[5],zmm18[5],zmm9[7],zmm18[7]
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm4[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm16, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm18, %zmm0, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm3[1],zmm24[1],zmm3[3],zmm24[3],zmm3[5],zmm24[5],zmm3[7],zmm24[7]
 ; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm2, %zmm1
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 192(%r10), %zmm9
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 192(%rax), %zmm20
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, %zmm1
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm20, %zmm14, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 192(%r10), %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 192(%rax), %zmm17
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm17, %zmm12, %zmm1
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 192(%r8), %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 192(%r9), %zmm10
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm4 = zmm2[0],zmm10[0],zmm2[2],zmm10[2],zmm2[4],zmm10[4],zmm2[6],zmm10[6]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm16
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 192(%r9), %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm4 = zmm2[0],zmm5[0],zmm2[2],zmm5[2],zmm2[4],zmm5[4],zmm2[6],zmm5[6]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, %zmm16
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm9
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 192(%rdi), %zmm1
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 192(%rsi), %zmm5
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm6
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm14, %zmm6
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm12, %zmm6
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 192(%rdx), %zmm2
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 192(%rcx), %zmm7
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm8
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm15, %zmm8
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm13, %zmm8
 ; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
 ; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm6, %zmm4, %zmm4
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, %zmm4
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm20, %zmm12, %zmm4
-; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm6 = zmm16[1],zmm10[1],zmm16[3],zmm10[3],zmm16[5],zmm10[5],zmm16[7],zmm10[7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm17, %zmm10, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm6 = zmm9[1],zmm16[1],zmm9[3],zmm16[3],zmm9[5],zmm16[5],zmm9[7],zmm16[7]
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm6 {%k1}
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm4
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm12, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm10, %zmm4
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm8
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm13, %zmm8
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm14, %zmm8
 ; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
 ; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm6, %zmm4
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm4
 ; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm11, %zmm4
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm6
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm3, %zmm6
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm15, %zmm6
 ; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm16, %zmm6
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm10, %zmm11, %zmm6
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm20[0],zmm9[2],zmm20[2],zmm9[4],zmm20[4],zmm9[6],zmm20[6]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, %zmm6
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm16, %zmm11, %zmm6
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm3[0],zmm17[0],zmm3[2],zmm17[2],zmm3[4],zmm17[4],zmm3[6],zmm17[6]
 ; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm6, %zmm4
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm0, %zmm1
 ; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm27, %zmm2
 ; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm16, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm10, %zmm0, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm20[1],zmm9[3],zmm20[3],zmm9[5],zmm20[5],zmm9[7],zmm20[7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm16, %zmm0, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm3[1],zmm17[1],zmm3[3],zmm17[3],zmm3[5],zmm17[5],zmm3[7],zmm17[7]
 ; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm2, %zmm1
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 256(%r10), %zmm9
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 256(%rax), %zmm22
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, %zmm1
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm22, %zmm14, %zmm1
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 256(%r8), %zmm5
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 256(%r9), %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm4 = zmm5[0],zmm2[0],zmm5[2],zmm2[2],zmm5[4],zmm2[4],zmm5[6],zmm2[6]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, %zmm17
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm16
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 256(%r10), %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 256(%rax), %zmm25
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm25, %zmm12, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 256(%r8), %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 256(%r9), %zmm18
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm4 = zmm2[0],zmm18[0],zmm2[2],zmm18[2],zmm2[4],zmm18[4],zmm2[6],zmm18[6]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm9
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 256(%rdi), %zmm1
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 256(%rsi), %zmm5
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm6
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm14, %zmm6
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm12, %zmm6
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 256(%rdx), %zmm2
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 256(%rcx), %zmm7
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm8
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm15, %zmm8
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm13, %zmm8
 ; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
 ; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm6, %zmm4, %zmm4
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, %zmm4
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm22, %zmm12, %zmm4
-; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm6 = zmm17[1],zmm16[1],zmm17[3],zmm16[3],zmm17[5],zmm16[5],zmm17[7],zmm16[7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm25, %zmm10, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm6 = zmm9[1],zmm18[1],zmm9[3],zmm18[3],zmm9[5],zmm18[5],zmm9[7],zmm18[7]
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm6 {%k1}
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm4
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm12, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm10, %zmm4
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm8
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm13, %zmm8
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm14, %zmm8
 ; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
 ; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm6, %zmm4
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm4
 ; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm11, %zmm4
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm6
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm3, %zmm6
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm15, %zmm6
 ; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm17, %zmm6
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm16, %zmm11, %zmm6
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm22[0],zmm9[2],zmm22[2],zmm9[4],zmm22[4],zmm9[6],zmm22[6]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, %zmm6
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm18, %zmm11, %zmm6
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm11, %zmm10
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm3[0],zmm25[0],zmm3[2],zmm25[2],zmm3[4],zmm25[4],zmm3[6],zmm25[6]
 ; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm6, %zmm4
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm0, %zmm1
 ; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm27, %zmm2
 ; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm17, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm16, %zmm0, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm22[1],zmm9[3],zmm22[3],zmm9[5],zmm22[5],zmm9[7],zmm22[7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm18, %zmm0, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm3[1],zmm25[1],zmm3[3],zmm25[3],zmm3[5],zmm25[5],zmm3[7],zmm25[7]
 ; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm2, %zmm1
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 320(%rdi), %zmm1
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 320(%rsi), %zmm17
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 320(%rsi), %zmm2
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm4
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm17, %zmm14, %zmm4
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 320(%rdx), %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 320(%rcx), %zmm9
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm5
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm9, %zmm15, %zmm5
-; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %ymm4, %ymm24
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm6
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm9, %zmm13, %zmm6
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm7
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm9, %zmm3, %zmm7
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm5
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm9, %zmm27, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 384(%rdx), %zmm8
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm2, %zmm12, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 320(%rdx), %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 320(%rcx), %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, %zmm6
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm13, %zmm6
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm6 = ymm4[0,1,2,3],ymm6[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, %zmm11
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm14, %zmm11
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, %zmm8
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm15, %zmm8
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm27, %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 384(%rdx), %zmm3
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 384(%rcx), %zmm4
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, %zmm9
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm15, %zmm9
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, %zmm9
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm13, %zmm9
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, %zmm3
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm5, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm13, %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm14, %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm15, %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm27, %zmm3
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm27, %zmm8
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 448(%rdx), %zmm3
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 448(%rcx), %zmm4
-; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm4, %zmm3, %zmm15
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm4, %zmm3, %zmm13
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm4, %zmm3, %zmm5
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm4, %zmm3, %zmm14
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm4, %zmm3, %zmm15
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm27, %zmm3
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 320(%r10), %zmm26
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 320(%rax), %zmm16
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm26, %zmm3
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm16, %zmm14, %zmm3
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 320(%r8), %zmm13
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 320(%r9), %zmm15
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm9 = zmm13[0],zmm15[0],zmm13[2],zmm15[2],zmm13[4],zmm15[4],zmm13[6],zmm15[6]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm9 {%k1}
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm24, %zmm9, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 320(%r10), %zmm23
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 320(%rax), %zmm9
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm23, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm9, %zmm12, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 320(%r8), %zmm22
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 320(%r9), %zmm16
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm15 = zmm22[0],zmm16[0],zmm22[2],zmm16[2],zmm22[4],zmm16[4],zmm22[6],zmm16[6]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm15 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm6, %zmm15, %zmm3
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm26, %zmm3
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm16, %zmm12, %zmm3
-; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm5 = zmm13[1],zmm15[1],zmm13[3],zmm15[3],zmm13[5],zmm15[5],zmm13[7],zmm15[7]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm5 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm23, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm9, %zmm4, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm6 = zmm22[1],zmm16[1],zmm22[3],zmm16[3],zmm22[5],zmm16[5],zmm22[7],zmm16[7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm6 {%k1}
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm3
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm17, %zmm12, %zmm3
-; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7]
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm5, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm2, %zmm4, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm6, %zmm3
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm3
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm17, %zmm11, %zmm3
-; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5,6,7]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm5
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm15, %zmm11, %zmm5
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm26[0],zmm16[0],zmm26[2],zmm16[2],zmm26[4],zmm16[4],zmm26[6],zmm16[6]
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm5, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm10, %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm2, %zmm10, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm22, %zmm6
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm16, %zmm10, %zmm6
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm23[0],zmm9[0],zmm23[2],zmm9[2],zmm23[4],zmm9[4],zmm23[6],zmm9[6]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm6, %zmm3
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm17, %zmm0, %zmm1
-; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm15, %zmm0, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm26[1],zmm16[1],zmm26[3],zmm16[3],zmm26[5],zmm16[5],zmm26[7],zmm16[7]
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm2, %zmm0, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm22, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm16, %zmm0, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm23[1],zmm9[1],zmm23[3],zmm9[3],zmm23[5],zmm9[5],zmm23[7],zmm9[7]
 ; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm2, %zmm1
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 384(%rdi), %zmm3
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 384(%rsi), %zmm1
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm4
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm14, %zmm4
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm5
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm12, %zmm4
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm12, %zmm5
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm5
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm11, %zmm5
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm6
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm12, %zmm6
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm6
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm4, %zmm6
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm6
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm10, %zmm6
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm0, %zmm3
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 384(%r10), %zmm29
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 384(%r10), %zmm31
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 384(%rax), %zmm3
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm29, %zmm1
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm14, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm31, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm12, %zmm1
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm29, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm31, %zmm1
 ; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm4, %zmm1
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 384(%r8), %zmm25
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 384(%r9), %zmm12
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm25, %zmm31
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm12, %zmm11, %zmm31
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 448(%r10), %zmm28
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 448(%rax), %zmm9
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm28, %zmm1
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm9, %zmm14, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 384(%r8), %zmm27
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 384(%r9), %zmm10
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm27, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm10, %zmm5, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 448(%r10), %zmm30
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 448(%rax), %zmm11
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm30, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm11, %zmm12, %zmm1
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 448(%rdi), %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 448(%rdi), %zmm6
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 448(%rsi), %zmm1
-; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm1, %zmm5, %zmm14
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm28, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm9, %zmm4, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm1, %zmm6, %zmm12
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm30, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm11, %zmm4, %zmm2
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm1, %zmm5, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm1, %zmm6, %zmm4
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 448(%r8), %zmm17
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 448(%r9), %zmm8
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm17, %zmm30
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm8, %zmm11, %zmm30
-; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm1, %zmm5, %zmm11
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm0, %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 448(%r8), %zmm19
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 448(%r9), %zmm12
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm19, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm12, %zmm5, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm1, %zmm6, %zmm5
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm25, %zmm24
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm12, %zmm0, %zmm24
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm17, %zmm27
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm8, %zmm0, %zmm27
-; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm5 = [0,8,0,8,0,8,0,8]
-; AVX512F-ONLY-FAST-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm21, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm19, %zmm5, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm6 = [1,9,1,9,1,9,1,9]
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm0, %zmm6
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm27, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm10, %zmm0, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm5
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm19, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm12, %zmm0, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm6 = [0,8,0,8,0,8,0,8]
 ; AVX512F-ONLY-FAST-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm21, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm19, %zmm6, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm20, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm26, %zmm6, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm14 = [1,9,1,9,1,9,1,9]
+; AVX512F-ONLY-FAST-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm20, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm26, %zmm14, %zmm0
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10]
 ; AVX512F-ONLY-FAST-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm21, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm19, %zmm2, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm20, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm26, %zmm2, %zmm0
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [3,11,3,11,3,11,3,11]
 ; AVX512F-ONLY-FAST-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm19, %zmm1, %zmm21
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm26, %zmm1, %zmm20
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm8
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm5, %zmm7
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm7
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm6, %zmm7
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm21
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm2, %zmm21
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm6, %zmm8
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm8
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm14, %zmm8
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm29
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm2, %zmm29
 ; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm1, %zmm0
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm23, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm28, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm21, %zmm6, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm28, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm21, %zmm14, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm28, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm21, %zmm2, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm21, %zmm1, %zmm28
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm5, %zmm4
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm23, %zmm4
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm6, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm26, %zmm6, %zmm4
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm23, %zmm4
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm2, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm26, %zmm14, %zmm4
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm1, %zmm23
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm28
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm26, %zmm2, %zmm28
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm26, %zmm1, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm7
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm5, %zmm7
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm7
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm6, %zmm7
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm19
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm2, %zmm19
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm1, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm4
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm18, %zmm5, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm24, %zmm6, %zmm4
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm4
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm18, %zmm6, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm24, %zmm14, %zmm4
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm4
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm18, %zmm2, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm24, %zmm2, %zmm4
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm18, %zmm1, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm24, %zmm1, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, %zmm4
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm7
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm5, %zmm7
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm7
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm6, %zmm7
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm6, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm14, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, %zmm21
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm2, %zmm21
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm1, %zmm7
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm14
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm2, %zmm14
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm1, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm4
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm20, %zmm5, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm17, %zmm6, %zmm4
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm4
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm20, %zmm6, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm17, %zmm14, %zmm4
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm4
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm20, %zmm2, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm17, %zmm2, %zmm4
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm20, %zmm1, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm17, %zmm1, %zmm0
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm4
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm10, %zmm5, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm6, %zmm4
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm4
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm10, %zmm6, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm14, %zmm4
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm11
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm10, %zmm2, %zmm11
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm10, %zmm1, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm15
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm2, %zmm15
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm1, %zmm0
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm4
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm22, %zmm5, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm25, %zmm6, %zmm4
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm4
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm22, %zmm6, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm25, %zmm14, %zmm4
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm4
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm22, %zmm2, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm25, %zmm2, %zmm4
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm22, %zmm1, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm25, %zmm1, %zmm0
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm7
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm5, %zmm7
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm7
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm6, %zmm7
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm10
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm2, %zmm10
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm1, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm26, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm16, %zmm5, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm18, %zmm6, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm18, %zmm14, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm13
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm18, %zmm2, %zmm13
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm18, %zmm1, %zmm0
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm26, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm16, %zmm6, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm23, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm9, %zmm6, %zmm0
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm26, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm16, %zmm2, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm23, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm9, %zmm14, %zmm0
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm16, %zmm1, %zmm26
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm22
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm15, %zmm5, %zmm22
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm23
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm15, %zmm6, %zmm23
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm7
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm15, %zmm2, %zmm7
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm15, %zmm1, %zmm13
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm29, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm5, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm23, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm9, %zmm2, %zmm0
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm29, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm9, %zmm1, %zmm23
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm22, %zmm24
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm16, %zmm6, %zmm24
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm22, %zmm25
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm16, %zmm14, %zmm25
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm22, %zmm8
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm16, %zmm2, %zmm8
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm16, %zmm1, %zmm22
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm31, %zmm0
 ; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm6, %zmm0
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm31 {%k1} = zmm29[0],zmm3[0],zmm29[2],zmm3[2],zmm29[4],zmm3[4],zmm29[6],zmm3[6]
-; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm24 {%k1} = zmm29[1],zmm3[1],zmm29[3],zmm3[3],zmm29[5],zmm3[5],zmm29[7],zmm3[7]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm29, %zmm20
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm2, %zmm20
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm1, %zmm29
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm25, %zmm15
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm12, %zmm5, %zmm15
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm25, %zmm16
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm12, %zmm6, %zmm16
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm25, %zmm18
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm12, %zmm2, %zmm18
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm25[0],zmm12[0],zmm25[2],zmm12[2],zmm25[4],zmm12[4],zmm25[6],zmm12[6]
-; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm4 = zmm25[1],zmm12[1],zmm25[3],zmm12[3],zmm25[5],zmm12[5],zmm25[7],zmm12[7]
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm12, %zmm1, %zmm25
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm28, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm9, %zmm5, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm31, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm14, %zmm0
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm28, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm9, %zmm6, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm31[0],zmm3[0],zmm31[2],zmm3[2],zmm31[4],zmm3[4],zmm31[6],zmm3[6]
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm31[1],zmm3[1],zmm31[3],zmm3[3],zmm31[5],zmm3[5],zmm31[7],zmm3[7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm31, %zmm26
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm2, %zmm26
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm1, %zmm31
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm27, %zmm17
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm10, %zmm6, %zmm17
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm27, %zmm20
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm10, %zmm14, %zmm20
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm27, %zmm18
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm10, %zmm2, %zmm18
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm27[0],zmm10[0],zmm27[2],zmm10[2],zmm27[4],zmm10[4],zmm27[6],zmm10[6]
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm4 = zmm27[1],zmm10[1],zmm27[3],zmm10[3],zmm27[5],zmm10[5],zmm27[7],zmm10[7]
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm10, %zmm1, %zmm27
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm30, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm11, %zmm6, %zmm0
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm30 {%k1} = zmm28[0],zmm9[0],zmm28[2],zmm9[2],zmm28[4],zmm9[4],zmm28[6],zmm9[6]
-; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm28[1],zmm9[1],zmm28[3],zmm9[3],zmm28[5],zmm9[5],zmm28[7],zmm9[7]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm28, %zmm12
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm9, %zmm2, %zmm12
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm9, %zmm1, %zmm28
-; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm8, %zmm17, %zmm5
-; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm8, %zmm17, %zmm6
-; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm8, %zmm17, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm9 = zmm17[0],zmm8[0],zmm17[2],zmm8[2],zmm17[4],zmm8[4],zmm17[6],zmm8[6]
-; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm0 = zmm17[1],zmm8[1],zmm17[3],zmm8[3],zmm17[5],zmm8[5],zmm17[7],zmm8[7]
-; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm8, %zmm1, %zmm17
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm30, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm11, %zmm14, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm10 {%k1} = zmm30[0],zmm11[0],zmm30[2],zmm11[2],zmm30[4],zmm11[4],zmm30[6],zmm11[6]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm30[1],zmm11[1],zmm30[3],zmm11[3],zmm30[5],zmm11[5],zmm30[7],zmm11[7]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm30, %zmm16
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm11, %zmm2, %zmm16
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm11, %zmm1, %zmm30
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm12, %zmm19, %zmm6
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm12, %zmm19, %zmm14
+; AVX512F-ONLY-FAST-NEXT:    vpermi2q %zmm12, %zmm19, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm11 = zmm19[0],zmm12[0],zmm19[2],zmm12[2],zmm19[4],zmm12[4],zmm19[6],zmm12[6]
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm0 = zmm19[1],zmm12[1],zmm19[3],zmm12[3],zmm19[5],zmm12[5],zmm19[7],zmm12[7]
+; AVX512F-ONLY-FAST-NEXT:    vpermt2q %zmm12, %zmm1, %zmm19
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
 ; AVX512F-ONLY-FAST-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
 ; AVX512F-ONLY-FAST-NEXT:    # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, %zmm3 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, %zmm3 {%k1}
 ; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm3, %zmm1
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
@@ -14875,19 +13438,20 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
 ; AVX512F-ONLY-FAST-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
 ; AVX512F-ONLY-FAST-NEXT:    # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm31, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm7, %zmm1
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
 ; AVX512F-ONLY-FAST-NEXT:    # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm24, %zmm1
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vinsertf64x4 $0, %ymm1, %zmm3, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
 ; AVX512F-ONLY-FAST-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
 ; AVX512F-ONLY-FAST-NEXT:    # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm9 {%k1}
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm9, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm11 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm11, %zmm1
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
 ; AVX512F-ONLY-FAST-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
@@ -14899,338 +13463,308 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512F-ONLY-FAST-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
 ; AVX512F-ONLY-FAST-NEXT:    # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm30, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm10, %zmm0
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512F-ONLY-FAST-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
 ; AVX512F-ONLY-FAST-NEXT:    # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm27, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm5, %zmm0
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm21 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm29 {%k1}
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa (%rcx), %ymm0
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa (%rdx), %ymm1
 ; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa (%rsi), %ymm4
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa (%rdi), %ymm8
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm9 = ymm8[0],ymm4[0],ymm8[2],ymm4[2]
-; AVX512F-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm9[2,3],ymm3[2,3]
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm21, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa (%rdi), %ymm9
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm11 = ymm9[0],ymm4[0],ymm9[2],ymm4[2]
+; AVX512F-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm11[2,3],ymm3[2,3]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm29, %zmm3
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm9 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm5 {%k1}
 ; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
-; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm4[1],ymm8[3],ymm4[3]
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm4[1],ymm9[3],ymm4[3]
 ; AVX512F-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm9, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm5, %zmm0
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm19 {%k1}
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa 64(%rsi), %ymm0
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa 64(%rdi), %ymm1
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa 64(%rcx), %ymm3
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa 64(%rdx), %ymm4
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm8 = ymm4[0],ymm3[0],ymm4[2],ymm3[2]
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm9 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
-; AVX512F-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm8 = ymm9[2,3],ymm8[2,3]
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm8, %zmm19, %zmm8
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 (%rsp), %zmm9 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, %zmm9 {%k1}
-; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm4[1],ymm3[1],ymm4[3],ymm3[3]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm28 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 64(%rcx), %ymm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 64(%rdx), %ymm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 64(%rsi), %ymm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 64(%rdi), %ymm9
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm11 = ymm9[0],ymm4[0],ymm9[2],ymm4[2]
+; AVX512F-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm11[2,3],ymm3[2,3]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm28, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm5 {%k1}
 ; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
-; AVX512F-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3]
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm9, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm4[1],ymm9[3],ymm4[3]
+; AVX512F-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm5, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm14 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm21 {%k1}
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa 128(%rcx), %ymm0
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa 128(%rdx), %ymm1
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa 128(%rsi), %ymm8
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa 128(%rdi), %ymm9
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm4 = ymm9[0],ymm8[0],ymm9[2],ymm8[2]
-; AVX512F-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3]
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm14, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm4 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 128(%rsi), %ymm9
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 128(%rdi), %ymm11
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm11[0],ymm9[0],ymm11[2],ymm9[2]
+; AVX512F-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm4[2,3]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm21, %zmm3
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 (%rsp), %zmm3 # 64-byte Reload
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
 ; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
-; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm8[1],ymm9[3],ymm8[3]
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm11[1],ymm9[1],ymm11[3],ymm9[3]
 ; AVX512F-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
 ; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm0
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm11 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm15 {%k1}
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa 192(%rcx), %ymm0
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa 192(%rdx), %ymm1
 ; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa 192(%rsi), %ymm4
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa 192(%rdi), %ymm8
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm14 = ymm8[0],ymm4[0],ymm8[2],ymm4[2]
-; AVX512F-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm14[2,3],ymm3[2,3]
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm11, %zmm24
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 192(%rdi), %ymm9
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm11 = ymm9[0],ymm4[0],ymm9[2],ymm4[2]
+; AVX512F-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm11[2,3],ymm3[2,3]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm15, %zmm29
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm9 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm5 {%k1}
 ; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
-; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm4[1],ymm8[3],ymm4[3]
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm4[1],ymm9[3],ymm4[3]
 ; AVX512F-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm9, %zmm21
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm5, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm10 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm13 {%k1}
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa 256(%rcx), %ymm0
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa 256(%rdx), %ymm1
 ; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa 256(%rsi), %ymm4
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa 256(%rdi), %ymm8
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm14 = ymm8[0],ymm4[0],ymm8[2],ymm4[2]
-; AVX512F-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm14[2,3],ymm3[2,3]
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm10, %zmm19
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm9 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 256(%rdi), %ymm11
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm15 = ymm11[0],ymm4[0],ymm11[2],ymm4[2]
+; AVX512F-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm15[2,3],ymm3[2,3]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm13, %zmm3
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, %zmm7 {%k1}
 ; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
-; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm4[1],ymm8[3],ymm4[3]
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm11[1],ymm4[1],ymm11[3],ymm4[3]
 ; AVX512F-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm9, %zmm10
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm7 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm7, %zmm0
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm8 {%k1}
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa 320(%rcx), %ymm1
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa 320(%rdx), %ymm3
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm3[0],ymm1[0],ymm3[2],ymm1[2]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa 320(%rsi), %ymm4
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa 320(%rdi), %ymm14
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm9 = ymm14[0],ymm4[0],ymm14[2],ymm4[2]
-; AVX512F-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm9[2,3],ymm0[2,3]
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm7, %zmm8
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm26, %zmm13 {%k1}
-; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3]
-; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm14[1],ymm4[1],ymm14[3],ymm4[3]
-; AVX512F-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3]
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm13, %zmm13
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm20, %zmm18 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 320(%rdx), %ymm4
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm11 = ymm4[0],ymm1[0],ymm4[2],ymm1[2]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 320(%rsi), %ymm12
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 320(%rdi), %ymm15
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm9 = ymm15[0],ymm12[0],ymm15[2],ymm12[2]
+; AVX512F-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm9 = ymm9[2,3],ymm11[2,3]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm9, %zmm8, %zmm8
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm23, %zmm22 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm4[1],ymm1[1],ymm4[3],ymm1[3]
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm4 = ymm15[1],ymm12[1],ymm15[3],ymm12[3]
+; AVX512F-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[2,3]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm22, %zmm4
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm26, %zmm18 {%k1}
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa 384(%rcx), %ymm1
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa 384(%rdx), %ymm3
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm4 = ymm3[0],ymm1[0],ymm3[2],ymm1[2]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa 384(%rsi), %ymm7
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa 384(%rdi), %ymm9
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm14 = ymm9[0],ymm7[0],ymm9[2],ymm7[2]
-; AVX512F-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm4 = ymm14[2,3],ymm4[2,3]
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm18, %zmm18
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm29, %zmm25 {%k1}
-; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3]
-; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm9[1],ymm7[1],ymm9[3],ymm7[3]
-; AVX512F-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3]
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm25, %zmm25
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm12, %zmm2 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 384(%rdx), %ymm9
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm10 = ymm9[0],ymm1[0],ymm9[2],ymm1[2]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 384(%rsi), %ymm11
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 384(%rdi), %ymm12
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm11[0],ymm12[2],ymm11[2]
+; AVX512F-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm10 = ymm13[2,3],ymm10[2,3]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm10, %zmm18, %zmm10
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm31, %zmm27 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm1[1],ymm9[3],ymm1[3]
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm9 = ymm12[1],ymm11[1],ymm12[3],ymm11[3]
+; AVX512F-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm9[2,3],ymm1[2,3]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm27, %zmm9
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm16, %zmm2 {%k1}
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa 448(%rcx), %ymm1
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa 448(%rdx), %ymm3
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm4 = ymm3[0],ymm1[0],ymm3[2],ymm1[2]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa 448(%rsi), %ymm7
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa 448(%rdi), %ymm9
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm12 = ymm9[0],ymm7[0],ymm9[2],ymm7[2]
-; AVX512F-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm4 = ymm12[2,3],ymm4[2,3]
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm2, %zmm4
-; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3]
-; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm9[1],ymm7[1],ymm9[3],ymm7[3]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 448(%rdx), %ymm7
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm11 = ymm7[0],ymm1[0],ymm7[2],ymm1[2]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 448(%rsi), %ymm12
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 448(%rdi), %ymm13
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm15 = ymm13[0],ymm12[0],ymm13[2],ymm12[2]
+; AVX512F-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm11 = ymm15[2,3],ymm11[2,3]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm11, %zmm2, %zmm11
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm1[1],ymm7[3],ymm1[3]
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm13[1],ymm12[1],ymm13[3],ymm12[3]
 ; AVX512F-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm28, %zmm17 {%k1}
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm17, %zmm9
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm11 {%k1}
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa (%rcx), %xmm2
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa (%rdx), %xmm3
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm2[0]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa (%rsi), %xmm7
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa (%rdi), %xmm12
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm14 = xmm12[0],xmm7[0]
-; AVX512F-ONLY-FAST-NEXT:    vinserti128 $1, %xmm1, %ymm14, %ymm1
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm11, %zmm11
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm3[1],xmm2[1]
-; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm3 = xmm12[1],xmm7[1]
-; AVX512F-ONLY-FAST-NEXT:    vinserti128 $1, %xmm2, %ymm3, %ymm2
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm1, %zmm7
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa 64(%rcx), %xmm2
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa 64(%rdx), %xmm12
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm12[0],xmm2[0]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa 64(%rsi), %xmm14
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 64(%rdi), %xmm17
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm20 = xmm17[0],xmm14[0]
-; AVX512F-ONLY-FAST-NEXT:    vinserti32x4 $1, %xmm3, %ymm20, %ymm3
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm1, %zmm3
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm12[1],xmm2[1]
-; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm12 = xmm17[1],xmm14[1]
-; AVX512F-ONLY-FAST-NEXT:    vinserti128 $1, %xmm2, %ymm12, %ymm2
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm1, %zmm2
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa 128(%rcx), %xmm12
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa 128(%rdx), %xmm14
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm17 = xmm14[0],xmm12[0]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 128(%rsi), %xmm20
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 128(%rdi), %xmm26
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm28 = xmm26[0],xmm20[0]
-; AVX512F-ONLY-FAST-NEXT:    vinserti32x4 $1, %xmm17, %ymm28, %ymm17
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm17, %zmm1, %zmm17
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm12 = xmm14[1],xmm12[1]
-; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm14 = xmm26[1],xmm20[1]
-; AVX512F-ONLY-FAST-NEXT:    vinserti128 $1, %xmm12, %ymm14, %ymm12
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm12, %zmm1, %zmm12
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm30, %zmm19 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm19, %zmm7
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa 192(%rcx), %xmm14
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 192(%rdx), %xmm20
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm26 = xmm20[0],xmm14[0]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 192(%rsi), %xmm28
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 192(%rdi), %xmm29
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm30 = xmm29[0],xmm28[0]
-; AVX512F-ONLY-FAST-NEXT:    vinserti32x4 $1, %xmm26, %ymm30, %ymm26
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm26, %zmm1, %zmm26
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm14 = xmm20[1],xmm14[1]
-; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm20 = xmm29[1],xmm28[1]
-; AVX512F-ONLY-FAST-NEXT:    vinserti32x4 $1, %xmm14, %ymm20, %ymm14
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm14, %zmm1, %zmm14
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 256(%rcx), %xmm20
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 256(%rdx), %xmm28
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm29 = xmm28[0],xmm20[0]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 256(%rsi), %xmm30
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa 256(%rdi), %xmm0
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm31 = xmm0[0],xmm30[0]
-; AVX512F-ONLY-FAST-NEXT:    vinserti32x4 $1, %xmm29, %ymm31, %ymm29
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm29, %zmm1, %zmm29
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm27 {%k1}
-; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm20 = xmm28[1],xmm20[1]
-; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm30[1]
-; AVX512F-ONLY-FAST-NEXT:    vinserti32x4 $1, %xmm20, %ymm0, %ymm0
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm27, %zmm1
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm22 {%k1}
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 320(%rcx), %xmm20
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 320(%rdx), %xmm28
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm30 = xmm28[0],xmm20[0]
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 320(%rsi), %xmm31
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa 320(%rdi), %xmm0
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm27 = xmm0[0],xmm31[0]
-; AVX512F-ONLY-FAST-NEXT:    vinserti32x4 $1, %xmm30, %ymm27, %ymm27
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm27, %zmm22, %zmm22
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm13 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa (%rsi), %xmm1
+; AVX512F-ONLY-FAST-NEXT:    vinserti128 $1, (%rcx), %ymm1, %ymm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa (%rdi), %xmm1
+; AVX512F-ONLY-FAST-NEXT:    vinserti128 $1, (%rdx), %ymm1, %ymm12
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm12[0],ymm2[0],ymm12[2],ymm2[2]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm13, %zmm1
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm15 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm12[1],ymm2[1],ymm12[3],ymm2[3]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm15, %zmm2
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm12, %zmm18 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 64(%rsi), %xmm12
+; AVX512F-ONLY-FAST-NEXT:    vinserti128 $1, 64(%rcx), %ymm12, %ymm12
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 64(%rdi), %xmm13
+; AVX512F-ONLY-FAST-NEXT:    vinserti128 $1, 64(%rdx), %ymm13, %ymm13
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm15 = ymm13[0],ymm12[0],ymm13[2],ymm12[2]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm15, %zmm18, %zmm15
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm18, %zmm19 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm12 = ymm13[1],ymm12[1],ymm13[3],ymm12[3]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm12, %zmm19, %zmm12
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm23 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa 128(%rsi), %xmm13
+; AVX512F-ONLY-FAST-NEXT:    vinserti128 $1, 128(%rcx), %ymm13, %ymm13
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 128(%rdi), %xmm18
+; AVX512F-ONLY-FAST-NEXT:    vinserti32x4 $1, 128(%rdx), %ymm18, %ymm18
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm19 = ymm18[0],ymm13[0],ymm18[2],ymm13[2]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm19, %zmm23, %zmm19
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm27, %zmm23 {%k1}
-; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm20 = xmm28[1],xmm20[1]
-; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm31[1]
-; AVX512F-ONLY-FAST-NEXT:    vinserti32x4 $1, %xmm20, %ymm0, %ymm0
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm23, %zmm0
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm20, %zmm15 {%k1}
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 384(%rcx), %xmm20
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 384(%rdx), %xmm23
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm27 = xmm23[0],xmm20[0]
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm23, %zmm27 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm13 = ymm18[1],ymm13[1],ymm18[3],ymm13[3]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm13, %zmm27, %zmm13
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm18, %zmm5 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 192(%rsi), %xmm18
+; AVX512F-ONLY-FAST-NEXT:    vinserti32x4 $1, 192(%rcx), %ymm18, %ymm18
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 192(%rdi), %xmm23
+; AVX512F-ONLY-FAST-NEXT:    vinserti32x4 $1, 192(%rdx), %ymm23, %ymm23
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm27 = ymm23[0],ymm18[0],ymm23[2],ymm18[2]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm27, %zmm5, %zmm27
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm30, %zmm31 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm18 = ymm23[1],ymm18[1],ymm23[3],ymm18[3]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm18, %zmm31, %zmm18
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, %zmm16 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 256(%rsi), %xmm23
+; AVX512F-ONLY-FAST-NEXT:    vinserti32x4 $1, 256(%rcx), %ymm23, %ymm23
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 256(%rdi), %xmm30
+; AVX512F-ONLY-FAST-NEXT:    vinserti32x4 $1, 256(%rdx), %ymm30, %ymm30
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm31 = ymm30[0],ymm23[0],ymm30[2],ymm23[2]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm31, %zmm16, %zmm26
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, %zmm16 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm23 = ymm30[1],ymm23[1],ymm30[3],ymm23[3]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm23, %zmm16, %zmm23
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, %zmm24 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 320(%rsi), %xmm28
+; AVX512F-ONLY-FAST-NEXT:    vinserti32x4 $1, 320(%rcx), %ymm28, %ymm28
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 320(%rdi), %xmm30
+; AVX512F-ONLY-FAST-NEXT:    vinserti32x4 $1, 320(%rdx), %ymm30, %ymm30
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm31 = ymm30[0],ymm28[0],ymm30[2],ymm28[2]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm31, %zmm24, %zmm21
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, %zmm25 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm28 = ymm30[1],ymm28[1],ymm30[3],ymm28[3]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm28, %zmm25, %zmm22
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, %zmm17 {%k1}
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 384(%rsi), %xmm28
+; AVX512F-ONLY-FAST-NEXT:    vinserti32x4 $1, 384(%rcx), %ymm28, %ymm28
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 384(%rdi), %xmm30
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm31 = xmm30[0],xmm28[0]
-; AVX512F-ONLY-FAST-NEXT:    vinserti32x4 $1, %xmm27, %ymm31, %ymm27
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm27, %zmm15, %zmm15
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm27, %zmm16 {%k1}
-; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm20 = xmm23[1],xmm20[1]
-; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm23 = xmm30[1],xmm28[1]
-; AVX512F-ONLY-FAST-NEXT:    vinserti32x4 $1, %xmm20, %ymm23, %ymm20
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm20, %zmm16, %zmm16
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm20, %zmm5 {%k1}
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 448(%rcx), %xmm20
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 448(%rdx), %xmm23
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm27 = xmm23[0],xmm20[0]
+; AVX512F-ONLY-FAST-NEXT:    vinserti32x4 $1, 384(%rdx), %ymm30, %ymm30
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm31 = ymm30[0],ymm28[0],ymm30[2],ymm28[2]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm31, %zmm17, %zmm17
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, %zmm20 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm28 = ymm30[1],ymm28[1],ymm30[3],ymm28[3]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm28, %zmm20, %zmm16
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, %zmm6 {%k1}
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 448(%rsi), %xmm28
+; AVX512F-ONLY-FAST-NEXT:    vinserti32x4 $1, 448(%rcx), %ymm28, %ymm28
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 448(%rdi), %xmm30
-; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm31 = xmm30[0],xmm28[0]
-; AVX512F-ONLY-FAST-NEXT:    vinserti32x4 $1, %xmm27, %ymm31, %ymm27
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm27, %zmm5, %zmm5
-; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm27, %zmm6 {%k1}
-; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm20 = xmm23[1],xmm20[1]
-; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm23 = xmm30[1],xmm28[1]
-; AVX512F-ONLY-FAST-NEXT:    vinserti32x4 $1, %xmm20, %ymm23, %ymm20
-; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm20, %zmm6, %zmm6
+; AVX512F-ONLY-FAST-NEXT:    vinserti32x4 $1, 448(%rdx), %ymm30, %ymm30
+; AVX512F-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm31 = ymm30[0],ymm28[0],ymm30[2],ymm28[2]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm31, %zmm6, %zmm6
+; AVX512F-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, %zmm14 {%k1}
+; AVX512F-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm28 = ymm30[1],ymm28[1],ymm30[3],ymm28[3]
+; AVX512F-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm28, %zmm14, %zmm5
 ; AVX512F-ONLY-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, 3776(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, 3712(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm25, 3264(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm18, 3200(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, 2752(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, 3776(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm11, 3712(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, 3264(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm10, 3200(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, 2752(%rax)
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, 2688(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm10, 2240(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm19, 2176(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm21, 1728(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm24, 1664(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm4, 1216(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm4, 1152(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovups (%rsp), %zmm4 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm4, 704(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm4, 640(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm4, 192(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm4, 128(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm4, 4032(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm4, 3968(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm4, 3904(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm4, 3840(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm6, 3648(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, 3584(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm4, 3520(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm4, 3456(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm4, 3392(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm4, 3328(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, 2240(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, 2176(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovups (%rsp), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 1728(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm29, 1664(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 1216(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 1152(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 704(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 640(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 192(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 128(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 4032(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 3968(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 3904(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 3840(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, 3648(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm6, 3584(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 3520(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 3456(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 3392(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 3328(%rax)
 ; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm16, 3136(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm15, 3072(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm4, 3008(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm4, 2944(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm4, 2880(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm4, 2816(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, 2624(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm22, 2560(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm17, 3072(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 3008(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 2944(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 2880(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 2816(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm22, 2624(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm21, 2560(%rax)
 ; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 2496(%rax)
 ; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
@@ -15239,8 +13773,8 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 2368(%rax)
 ; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 2304(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, 2112(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm29, 2048(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm23, 2112(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm26, 2048(%rax)
 ; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 1984(%rax)
 ; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
@@ -15249,8 +13783,8 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 1856(%rax)
 ; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 1792(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm14, 1600(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm26, 1536(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm18, 1600(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm27, 1536(%rax)
 ; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 1472(%rax)
 ; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
@@ -15259,8 +13793,8 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 1344(%rax)
 ; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 1280(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm12, 1088(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm17, 1024(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, 1088(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm19, 1024(%rax)
 ; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 960(%rax)
 ; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
@@ -15269,8 +13803,8 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 832(%rax)
 ; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 768(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, 576(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, 512(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm12, 576(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm15, 512(%rax)
 ; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 448(%rax)
 ; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
@@ -15279,608 +13813,621 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 320(%rax)
 ; AVX512F-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512F-ONLY-FAST-NEXT:    vmovaps %zmm0, 256(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, 64(%rax)
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm11, (%rax)
-; AVX512F-ONLY-FAST-NEXT:    addq $5384, %rsp # imm = 0x1508
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, 64(%rax)
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, (%rax)
+; AVX512F-ONLY-FAST-NEXT:    addq $5576, %rsp # imm = 0x15C8
 ; AVX512F-ONLY-FAST-NEXT:    vzeroupper
 ; AVX512F-ONLY-FAST-NEXT:    retq
 ;
 ; AVX512DQ-SLOW-LABEL: store_i64_stride8_vf64:
 ; AVX512DQ-SLOW:       # %bb.0:
-; AVX512DQ-SLOW-NEXT:    subq $5384, %rsp # imm = 0x1508
+; AVX512DQ-SLOW-NEXT:    subq $5576, %rsp # imm = 0x15C8
 ; AVX512DQ-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512DQ-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 128(%rdi), %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovaps 128(%rdi), %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 64(%rdi), %zmm4
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm8
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 128(%rsi), %zmm16
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 64(%rsi), %zmm17
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 128(%rsi), %zmm23
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 64(%rsi), %zmm25
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%rsi), %zmm3
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 64(%rdx), %zmm6
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%rdx), %zmm9
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 64(%rcx), %zmm7
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%rcx), %zmm11
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%r8), %zmm2
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 64(%r8), %zmm24
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 128(%r8), %zmm20
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 64(%r9), %zmm25
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 128(%r9), %zmm22
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%r9), %zmm18
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 64(%r10), %zmm23
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%r10), %zmm21
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 64(%rax), %zmm26
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%rax), %zmm19
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%rcx), %zmm10
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%r8), %zmm14
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 64(%r8), %zmm19
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 128(%r8), %zmm16
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%r9), %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 64(%r9), %zmm21
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 128(%r9), %zmm18
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%r10), %zmm20
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 64(%r10), %zmm28
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 (%rax), %zmm26
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 64(%rax), %zmm24
 ; AVX512DQ-SLOW-NEXT:    movb $-64, %r11b
 ; AVX512DQ-SLOW-NEXT:    kmovw %r11d, %k1
-; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm14 = [4,12,4,12,4,12,4,12]
-; AVX512DQ-SLOW-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm0
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm19, %zmm14, %zmm0
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm10 = zmm2[0],zmm18[0],zmm2[2],zmm18[2],zmm2[4],zmm18[4],zmm2[6],zmm18[6]
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm10 {%k1}
+; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [4,12,4,12,4,12,4,12]
+; AVX512DQ-SLOW-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm0
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm26, %zmm2, %zmm0
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm11 = zmm14[0],zmm1[0],zmm14[2],zmm1[2],zmm14[4],zmm1[4],zmm14[6],zmm1[6]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm11 {%k1}
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm0
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm3, %zmm14, %zmm0
-; AVX512DQ-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm15 = [4,12,4,12]
-; AVX512DQ-SLOW-NEXT:    # ymm15 = mem[0,1,0,1]
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm13
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm11, %zmm15, %zmm13
-; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7]
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm10, %zmm0
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm3, %zmm2, %zmm0
+; AVX512DQ-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm13 = [4,12,4,12]
+; AVX512DQ-SLOW-NEXT:    # ymm13 = mem[0,1,0,1]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm12
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm10, %zmm13, %zmm12
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm11, %zmm0
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,5,13,5,13,5,13]
-; AVX512DQ-SLOW-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm0
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm19, %zmm12, %zmm0
-; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm10 = zmm2[1],zmm18[1],zmm2[3],zmm18[3],zmm2[5],zmm18[5],zmm2[7],zmm18[7]
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm5
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm10 {%k1}
+; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm5 = [5,13,5,13,5,13,5,13]
+; AVX512DQ-SLOW-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm0
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm26, %zmm5, %zmm0
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm11 = zmm14[1],zmm1[1],zmm14[3],zmm1[3],zmm14[5],zmm1[5],zmm14[7],zmm1[7]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm14, %zmm15
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm11 {%k1}
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm0
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm3, %zmm12, %zmm0
-; AVX512DQ-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [5,13,5,13]
-; AVX512DQ-SLOW-NEXT:    # ymm2 = mem[0,1,0,1]
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm13
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm11, %zmm2, %zmm13
-; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7]
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm10, %zmm0
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm3, %zmm5, %zmm0
+; AVX512DQ-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm14 = [5,13,5,13]
+; AVX512DQ-SLOW-NEXT:    # ymm14 = mem[0,1,0,1]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm12
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm10, %zmm14, %zmm12
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm11, %zmm0
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,6,14,6,14,6,14]
-; AVX512DQ-SLOW-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm0
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm27
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm18, %zmm13, %zmm0
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm21[0],zmm19[0],zmm21[2],zmm19[2],zmm21[4],zmm19[4],zmm21[6],zmm19[6]
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm10
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm3, %zmm13, %zmm10
-; AVX512DQ-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [6,14,6,14]
-; AVX512DQ-SLOW-NEXT:    # ymm5 = mem[0,1,0,1]
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm13
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm11, %zmm5, %zmm13
-; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm13[4,5,6,7]
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm10, %zmm0, %zmm0
+; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm12 = [6,14,6,14,6,14,6,14]
+; AVX512DQ-SLOW-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm15, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm15, %zmm22
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm17
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm1, %zmm12, %zmm0
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm20[0],zmm26[0],zmm20[2],zmm26[2],zmm20[4],zmm26[4],zmm20[6],zmm26[6]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm11
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm3, %zmm12, %zmm11
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm1
+; AVX512DQ-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm15 = [6,14,6,14]
+; AVX512DQ-SLOW-NEXT:    # ymm15 = mem[0,1,0,1]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm12
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm10, %zmm15, %zmm12
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm11, %zmm0, %zmm0
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,7,15,7,15,7,15]
 ; AVX512DQ-SLOW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm27, %zmm10
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm18, %zmm0, %zmm10
-; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm10 {%k1} = zmm21[1],zmm19[1],zmm21[3],zmm19[3],zmm21[5],zmm19[5],zmm21[7],zmm19[7]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm22, %zmm11
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm17, %zmm0, %zmm11
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm11 {%k1} = zmm20[1],zmm26[1],zmm20[3],zmm26[3],zmm20[5],zmm26[5],zmm20[7],zmm26[7]
 ; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm3, %zmm0, %zmm8
 ; AVX512DQ-SLOW-NEXT:    vbroadcasti64x2 {{.*#+}} ymm27 = [7,15,7,15]
 ; AVX512DQ-SLOW-NEXT:    # ymm27 = mem[0,1,0,1]
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm11, %zmm27, %zmm9
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm10, %zmm27, %zmm9
 ; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm8, %zmm10, %zmm8
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm8, %zmm11, %zmm8
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm8
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm26, %zmm14, %zmm8
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm9 = zmm24[0],zmm25[0],zmm24[2],zmm25[2],zmm24[4],zmm25[4],zmm24[6],zmm25[6]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm8
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm12
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm24, %zmm2, %zmm8
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm9 = zmm19[0],zmm21[0],zmm19[2],zmm21[2],zmm19[4],zmm21[4],zmm19[6],zmm21[6]
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm9 {%k1}
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm8
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm17, %zmm14, %zmm8
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm25, %zmm2, %zmm8
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm10
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm7, %zmm15, %zmm10
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm7, %zmm13, %zmm10
 ; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7]
 ; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm8, %zmm9, %zmm8
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm8
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm26, %zmm12, %zmm8
-; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm9 = zmm24[1],zmm25[1],zmm24[3],zmm25[3],zmm24[5],zmm25[5],zmm24[7],zmm25[7]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm8
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm3
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm24, %zmm5, %zmm8
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm9 = zmm19[1],zmm21[1],zmm19[3],zmm21[3],zmm19[5],zmm21[5],zmm19[7],zmm21[7]
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm9 {%k1}
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm8
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm17, %zmm12, %zmm8
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm25, %zmm5, %zmm8
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm10
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm7, %zmm2, %zmm10
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm7, %zmm14, %zmm10
 ; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7]
 ; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm8, %zmm9, %zmm8
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm8
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm17, %zmm11, %zmm8
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm25, %zmm1, %zmm8
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm9
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm10
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm7, %zmm5, %zmm9
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm7, %zmm15, %zmm9
 ; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm24, %zmm5
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm24, (%rsp) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm24, %zmm9
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm25, %zmm11, %zmm9
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm23[0],zmm26[0],zmm23[2],zmm26[2],zmm23[4],zmm26[4],zmm23[6],zmm26[6]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm19, %zmm5
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm19, %zmm9
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm21, %zmm1, %zmm9
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm24, %zmm10
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm28[0],zmm24[0],zmm28[2],zmm24[2],zmm28[4],zmm24[4],zmm28[6],zmm24[6]
 ; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm8, %zmm9, %zmm8
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 128(%r10), %zmm9
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm17, %zmm0, %zmm4
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 128(%rax), %zmm18
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 128(%r10), %zmm2
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm25, %zmm0, %zmm4
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 128(%rax), %zmm24
 ; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm7, %zmm27, %zmm6
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm25, %zmm0, %zmm5
-; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm23[1],zmm26[1],zmm23[3],zmm26[3],zmm23[5],zmm26[5],zmm23[7],zmm26[7]
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm21, %zmm0, %zmm5
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm28[1],zmm10[1],zmm28[3],zmm10[3],zmm28[5],zmm10[5],zmm28[7],zmm10[7]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm21
 ; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7]
 ; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm5, %zmm4
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm4
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm18, %zmm14, %zmm4
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm5 = zmm20[0],zmm22[0],zmm20[2],zmm22[2],zmm20[4],zmm22[4],zmm20[6],zmm22[6]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm4
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm24, %zmm12, %zmm4
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm5 = zmm16[0],zmm18[0],zmm16[2],zmm18[2],zmm16[4],zmm18[4],zmm16[6],zmm18[6]
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm5 {%k1}
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm6
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm16, %zmm14, %zmm6
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm6
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm23, %zmm12, %zmm6
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 128(%rdx), %zmm4
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 128(%rcx), %zmm7
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm8
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm7, %zmm15, %zmm8
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm7, %zmm13, %zmm8
 ; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
 ; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm6, %zmm5, %zmm5
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm5
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm18, %zmm12, %zmm5
-; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm6 = zmm20[1],zmm22[1],zmm20[3],zmm22[3],zmm20[5],zmm22[5],zmm20[7],zmm22[7]
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm5
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm24, %zmm3, %zmm5
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm6 = zmm16[1],zmm18[1],zmm16[3],zmm18[3],zmm16[5],zmm18[5],zmm16[7],zmm18[7]
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm6 {%k1}
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm5
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm16, %zmm12, %zmm5
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm5
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm23, %zmm3, %zmm5
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm10
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm8
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm7, %zmm2, %zmm8
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm13
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm7, %zmm14, %zmm8
 ; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7]
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm5, %zmm6, %zmm2
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm5
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm16, %zmm11, %zmm5
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm5, %zmm6, %zmm5
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm5
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm11
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm23, %zmm1, %zmm5
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm6
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm7, %zmm10, %zmm6
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm3
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm7, %zmm15, %zmm6
 ; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7]
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm6
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm22, %zmm11, %zmm6
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm18[0],zmm9[2],zmm18[2],zmm9[4],zmm18[4],zmm9[6],zmm18[6]
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm5, %zmm6, %zmm2
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm16, %zmm0, %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm6
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm18, %zmm1, %zmm6
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm3
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm2, (%rsp) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm2[0],zmm24[0],zmm2[2],zmm24[2],zmm2[4],zmm24[4],zmm2[6],zmm24[6]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm5, %zmm6, %zmm5
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm23, %zmm0, %zmm9
 ; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm7, %zmm27, %zmm4
-; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7]
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm2
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm22, %zmm0, %zmm2
-; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm18[1],zmm9[3],zmm18[3],zmm9[5],zmm18[5],zmm9[7],zmm18[7]
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm2
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm18, %zmm0, %zmm2
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm3[1],zmm24[1],zmm3[3],zmm24[3],zmm3[5],zmm24[5],zmm3[7],zmm24[7]
 ; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm2, %zmm1
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 192(%r10), %zmm9
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 192(%rax), %zmm20
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm1
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm20, %zmm14, %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 192(%r10), %zmm3
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 192(%rax), %zmm17
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm1
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm17, %zmm12, %zmm1
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 192(%r8), %zmm2
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 192(%r9), %zmm10
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm4 = zmm2[0],zmm10[0],zmm2[2],zmm10[2],zmm2[4],zmm10[4],zmm2[6],zmm10[6]
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm16
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 192(%r9), %zmm5
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm4 = zmm2[0],zmm5[0],zmm2[2],zmm5[2],zmm2[4],zmm5[4],zmm2[6],zmm5[6]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm16
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm9
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 192(%rdi), %zmm1
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 192(%rsi), %zmm5
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm6
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm5, %zmm14, %zmm6
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm5, %zmm12, %zmm6
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 192(%rdx), %zmm2
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 192(%rcx), %zmm7
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm8
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm7, %zmm15, %zmm8
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm7, %zmm13, %zmm8
 ; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
 ; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm6, %zmm4, %zmm4
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm4
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm20, %zmm12, %zmm4
-; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm6 = zmm16[1],zmm10[1],zmm16[3],zmm10[3],zmm16[5],zmm10[5],zmm16[7],zmm10[7]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm4
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm17, %zmm10, %zmm4
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm6 = zmm9[1],zmm16[1],zmm9[3],zmm16[3],zmm9[5],zmm16[5],zmm9[7],zmm16[7]
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm6 {%k1}
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm4
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm5, %zmm12, %zmm4
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm5, %zmm10, %zmm4
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm8
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm7, %zmm13, %zmm8
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm7, %zmm14, %zmm8
 ; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
 ; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm6, %zmm4
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm4
 ; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm5, %zmm11, %zmm4
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm6
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm7, %zmm3, %zmm6
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm7, %zmm15, %zmm6
 ; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7]
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm6
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm10, %zmm11, %zmm6
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm20[0],zmm9[2],zmm20[2],zmm9[4],zmm20[4],zmm9[6],zmm20[6]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm6
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm16, %zmm11, %zmm6
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm3[0],zmm17[0],zmm3[2],zmm17[2],zmm3[4],zmm17[4],zmm3[6],zmm17[6]
 ; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm6, %zmm4
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm5, %zmm0, %zmm1
 ; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm7, %zmm27, %zmm2
 ; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm2
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm10, %zmm0, %zmm2
-; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm20[1],zmm9[3],zmm20[3],zmm9[5],zmm20[5],zmm9[7],zmm20[7]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm2
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm16, %zmm0, %zmm2
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm3[1],zmm17[1],zmm3[3],zmm17[3],zmm3[5],zmm17[5],zmm3[7],zmm17[7]
 ; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm2, %zmm1
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 256(%r10), %zmm9
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 256(%rax), %zmm22
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm1
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm22, %zmm14, %zmm1
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 256(%r8), %zmm5
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 256(%r9), %zmm2
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm4 = zmm5[0],zmm2[0],zmm5[2],zmm2[2],zmm5[4],zmm2[4],zmm5[6],zmm2[6]
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm17
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm16
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 256(%r10), %zmm3
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 256(%rax), %zmm25
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm1
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm25, %zmm12, %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 256(%r8), %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 256(%r9), %zmm18
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm4 = zmm2[0],zmm18[0],zmm2[2],zmm18[2],zmm2[4],zmm18[4],zmm2[6],zmm18[6]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm9
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 256(%rdi), %zmm1
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 256(%rsi), %zmm5
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm6
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm5, %zmm14, %zmm6
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm5, %zmm12, %zmm6
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 256(%rdx), %zmm2
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 256(%rcx), %zmm7
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm8
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm7, %zmm15, %zmm8
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm7, %zmm13, %zmm8
 ; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
 ; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm6, %zmm4, %zmm4
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm4
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm22, %zmm12, %zmm4
-; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm6 = zmm17[1],zmm16[1],zmm17[3],zmm16[3],zmm17[5],zmm16[5],zmm17[7],zmm16[7]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm4
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm25, %zmm10, %zmm4
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm6 = zmm9[1],zmm18[1],zmm9[3],zmm18[3],zmm9[5],zmm18[5],zmm9[7],zmm18[7]
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm6 {%k1}
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm4
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm5, %zmm12, %zmm4
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm5, %zmm10, %zmm4
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm8
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm7, %zmm13, %zmm8
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm7, %zmm14, %zmm8
 ; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
 ; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm6, %zmm4
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm4
 ; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm5, %zmm11, %zmm4
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm6
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm7, %zmm3, %zmm6
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm7, %zmm15, %zmm6
 ; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7]
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm6
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm16, %zmm11, %zmm6
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm22[0],zmm9[2],zmm22[2],zmm9[4],zmm22[4],zmm9[6],zmm22[6]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm6
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm18, %zmm11, %zmm6
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm10
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm3[0],zmm25[0],zmm3[2],zmm25[2],zmm3[4],zmm25[4],zmm3[6],zmm25[6]
 ; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm6, %zmm4
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm5, %zmm0, %zmm1
 ; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm7, %zmm27, %zmm2
 ; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm2
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm16, %zmm0, %zmm2
-; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm22[1],zmm9[3],zmm22[3],zmm9[5],zmm22[5],zmm9[7],zmm22[7]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm2
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm18, %zmm0, %zmm2
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm3[1],zmm25[1],zmm3[3],zmm25[3],zmm3[5],zmm25[5],zmm3[7],zmm25[7]
 ; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm2, %zmm1
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 320(%rdi), %zmm1
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 320(%rsi), %zmm17
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 320(%rsi), %zmm2
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm4
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm17, %zmm14, %zmm4
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 320(%rdx), %zmm2
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 320(%rcx), %zmm9
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm5
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm9, %zmm15, %zmm5
-; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7]
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %ymm4, %ymm24
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm6
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm9, %zmm13, %zmm6
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm7
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm9, %zmm3, %zmm7
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm5
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm9, %zmm27, %zmm2
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 384(%rdx), %zmm8
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm2, %zmm12, %zmm4
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 320(%rdx), %zmm7
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 320(%rcx), %zmm5
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm6
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm5, %zmm13, %zmm6
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm6 = ymm4[0,1,2,3],ymm6[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm11
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm5, %zmm14, %zmm11
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm8
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm5, %zmm15, %zmm8
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm5, %zmm27, %zmm7
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 384(%rdx), %zmm3
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 384(%rcx), %zmm4
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm9
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm4, %zmm15, %zmm9
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm9
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm4, %zmm13, %zmm9
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm3
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm4, %zmm5, %zmm3
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm5
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm4, %zmm13, %zmm5
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm5
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm4, %zmm14, %zmm5
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm5
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm4, %zmm15, %zmm5
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm4, %zmm27, %zmm3
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm4, %zmm27, %zmm8
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 448(%rdx), %zmm3
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 448(%rcx), %zmm4
-; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm4, %zmm3, %zmm15
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm4, %zmm3, %zmm13
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm4, %zmm3, %zmm5
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm4, %zmm3, %zmm14
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm4, %zmm3, %zmm15
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm4, %zmm27, %zmm3
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 320(%r10), %zmm26
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 320(%rax), %zmm16
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm26, %zmm3
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm16, %zmm14, %zmm3
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 320(%r8), %zmm13
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 320(%r9), %zmm15
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm9 = zmm13[0],zmm15[0],zmm13[2],zmm15[2],zmm13[4],zmm15[4],zmm13[6],zmm15[6]
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm9 {%k1}
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm24, %zmm9, %zmm3
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 320(%r10), %zmm23
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 320(%rax), %zmm9
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm3
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm9, %zmm12, %zmm3
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 320(%r8), %zmm22
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 320(%r9), %zmm16
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm15 = zmm22[0],zmm16[0],zmm22[2],zmm16[2],zmm22[4],zmm16[4],zmm22[6],zmm16[6]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm15 {%k1}
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm6, %zmm15, %zmm3
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm26, %zmm3
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm16, %zmm12, %zmm3
-; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm5 = zmm13[1],zmm15[1],zmm13[3],zmm15[3],zmm13[5],zmm15[5],zmm13[7],zmm15[7]
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm5 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm3
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm9, %zmm4, %zmm3
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm6 = zmm22[1],zmm16[1],zmm22[3],zmm16[3],zmm22[5],zmm16[5],zmm22[7],zmm16[7]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm6 {%k1}
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm3
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm17, %zmm12, %zmm3
-; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7]
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm5, %zmm3
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm2, %zmm4, %zmm3
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm6, %zmm3
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm3
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm17, %zmm11, %zmm3
-; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5,6,7]
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm5
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm15, %zmm11, %zmm5
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm26[0],zmm16[0],zmm26[2],zmm16[2],zmm26[4],zmm16[4],zmm26[6],zmm16[6]
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm5, %zmm3
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm5
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm2, %zmm10, %zmm3
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm22, %zmm6
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm16, %zmm10, %zmm6
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm23[0],zmm9[0],zmm23[2],zmm9[2],zmm23[4],zmm9[4],zmm23[6],zmm9[6]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm6, %zmm3
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm17, %zmm0, %zmm1
-; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm2
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm15, %zmm0, %zmm2
-; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm26[1],zmm16[1],zmm26[3],zmm16[3],zmm26[5],zmm16[5],zmm26[7],zmm16[7]
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm2, %zmm0, %zmm1
+; AVX512DQ-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm22, %zmm2
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm16, %zmm0, %zmm2
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm23[1],zmm9[1],zmm23[3],zmm9[3],zmm23[5],zmm9[5],zmm23[7],zmm9[7]
 ; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm2, %zmm1
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 384(%rdi), %zmm3
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 384(%rsi), %zmm1
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm4
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm1, %zmm14, %zmm4
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm5
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm4
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm1, %zmm12, %zmm5
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm5
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm1, %zmm11, %zmm5
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm6
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm1, %zmm12, %zmm6
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm6
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm1, %zmm4, %zmm6
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm6
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm1, %zmm10, %zmm6
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm1, %zmm0, %zmm3
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 384(%r10), %zmm29
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 384(%r10), %zmm31
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 384(%rax), %zmm3
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm29, %zmm1
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm3, %zmm14, %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm1
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm3, %zmm12, %zmm1
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm29, %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm1
 ; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm3, %zmm4, %zmm1
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 384(%r8), %zmm25
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 384(%r9), %zmm12
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm25, %zmm31
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm12, %zmm11, %zmm31
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 448(%r10), %zmm28
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 448(%rax), %zmm9
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm1
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm9, %zmm14, %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 384(%r8), %zmm27
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 384(%r9), %zmm10
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm27, %zmm1
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm10, %zmm5, %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 448(%r10), %zmm30
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 448(%rax), %zmm11
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm30, %zmm1
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm11, %zmm12, %zmm1
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 448(%rdi), %zmm5
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 448(%rdi), %zmm6
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 448(%rsi), %zmm1
-; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm1, %zmm5, %zmm14
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm2
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm9, %zmm4, %zmm2
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm1, %zmm6, %zmm12
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm30, %zmm2
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm11, %zmm4, %zmm2
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm1, %zmm5, %zmm4
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm1, %zmm6, %zmm4
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 448(%r8), %zmm17
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 448(%r9), %zmm8
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm30
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm8, %zmm11, %zmm30
-; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm1, %zmm5, %zmm11
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm1, %zmm0, %zmm5
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 448(%r8), %zmm19
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 448(%r9), %zmm12
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm19, %zmm2
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm12, %zmm5, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm1, %zmm6, %zmm5
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm25, %zmm24
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm12, %zmm0, %zmm24
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm27
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm8, %zmm0, %zmm27
-; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm5 = [0,8,0,8,0,8,0,8]
-; AVX512DQ-SLOW-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm0
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm19, %zmm5, %zmm0
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm6 = [1,9,1,9,1,9,1,9]
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm1, %zmm0, %zmm6
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm27, %zmm1
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm10, %zmm0, %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm5
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm19, %zmm1
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm12, %zmm0, %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm6 = [0,8,0,8,0,8,0,8]
 ; AVX512DQ-SLOW-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm0
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm19, %zmm6, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm0
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm26, %zmm6, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm14 = [1,9,1,9,1,9,1,9]
+; AVX512DQ-SLOW-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm0
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm26, %zmm14, %zmm0
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10]
 ; AVX512DQ-SLOW-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm0
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm19, %zmm2, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm0
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm26, %zmm2, %zmm0
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [3,11,3,11,3,11,3,11]
 ; AVX512DQ-SLOW-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm19, %zmm1, %zmm21
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm26, %zmm1, %zmm20
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm7
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm8
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm4, %zmm5, %zmm7
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm7
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm4, %zmm6, %zmm7
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm21
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm4, %zmm2, %zmm21
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm4, %zmm6, %zmm8
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm8
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm4, %zmm14, %zmm8
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm29
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm4, %zmm2, %zmm29
 ; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm4, %zmm1, %zmm0
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm4
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm0
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm21, %zmm6, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm0
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm21, %zmm14, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm0
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm21, %zmm2, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm21, %zmm1, %zmm28
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm0, %zmm5, %zmm4
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm4
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm0, %zmm6, %zmm4
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm26, %zmm6, %zmm4
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm4
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm0, %zmm2, %zmm4
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm26, %zmm14, %zmm4
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm0, %zmm1, %zmm23
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm28
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm26, %zmm2, %zmm28
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm26, %zmm1, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm7
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm4, %zmm5, %zmm7
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm7
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm4, %zmm6, %zmm7
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm19
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm4, %zmm2, %zmm19
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm4, %zmm1, %zmm0
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm4
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm18, %zmm5, %zmm4
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm24, %zmm6, %zmm4
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm4
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm18, %zmm6, %zmm4
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm24, %zmm14, %zmm4
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm4
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm18, %zmm2, %zmm4
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm24, %zmm2, %zmm4
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm18, %zmm1, %zmm0
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm24, %zmm1, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm4
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm7
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm4, %zmm5, %zmm7
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm7
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm4, %zmm6, %zmm7
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm0, %zmm6, %zmm4
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm4
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm0, %zmm14, %zmm4
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm21
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm0, %zmm2, %zmm21
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm0, %zmm1, %zmm7
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm14
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm4, %zmm2, %zmm14
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm4, %zmm1, %zmm0
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm4
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm20, %zmm5, %zmm4
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm17, %zmm6, %zmm4
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm4
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm20, %zmm6, %zmm4
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm17, %zmm14, %zmm4
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm4
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm20, %zmm2, %zmm4
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm17, %zmm2, %zmm4
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm20, %zmm1, %zmm0
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm17, %zmm1, %zmm0
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm4
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm10, %zmm5, %zmm4
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm7, %zmm6, %zmm4
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm4
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm10, %zmm6, %zmm4
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm7, %zmm14, %zmm4
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm11
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm10, %zmm2, %zmm11
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm10, %zmm1, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm15
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm7, %zmm2, %zmm15
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm7, %zmm1, %zmm0
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm4
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm22, %zmm5, %zmm4
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm25, %zmm6, %zmm4
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm4
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm22, %zmm6, %zmm4
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm25, %zmm14, %zmm4
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm4
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm22, %zmm2, %zmm4
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm25, %zmm2, %zmm4
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm22, %zmm1, %zmm0
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm25, %zmm1, %zmm0
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm7
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm4, %zmm5, %zmm7
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm7
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm4, %zmm6, %zmm7
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm10
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm4, %zmm2, %zmm10
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm4, %zmm1, %zmm0
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm26, %zmm0
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm16, %zmm5, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm18, %zmm6, %zmm4
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm18, %zmm14, %zmm4
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm13
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm18, %zmm2, %zmm13
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm18, %zmm1, %zmm0
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm26, %zmm0
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm16, %zmm6, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm0
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm9, %zmm6, %zmm0
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm26, %zmm0
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm16, %zmm2, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm0
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm9, %zmm14, %zmm0
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm16, %zmm1, %zmm26
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm22
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm15, %zmm5, %zmm22
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm23
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm15, %zmm6, %zmm23
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm7
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm15, %zmm2, %zmm7
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm15, %zmm1, %zmm13
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm29, %zmm0
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm3, %zmm5, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm0
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm9, %zmm2, %zmm0
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm29, %zmm0
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm9, %zmm1, %zmm23
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm22, %zmm24
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm16, %zmm6, %zmm24
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm22, %zmm25
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm16, %zmm14, %zmm25
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm22, %zmm8
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm16, %zmm2, %zmm8
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm16, %zmm1, %zmm22
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm0
 ; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm3, %zmm6, %zmm0
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm31 {%k1} = zmm29[0],zmm3[0],zmm29[2],zmm3[2],zmm29[4],zmm3[4],zmm29[6],zmm3[6]
-; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm24 {%k1} = zmm29[1],zmm3[1],zmm29[3],zmm3[3],zmm29[5],zmm3[5],zmm29[7],zmm3[7]
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm29, %zmm20
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm3, %zmm2, %zmm20
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm3, %zmm1, %zmm29
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm25, %zmm15
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm12, %zmm5, %zmm15
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm25, %zmm16
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm12, %zmm6, %zmm16
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm25, %zmm18
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm12, %zmm2, %zmm18
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm25[0],zmm12[0],zmm25[2],zmm12[2],zmm25[4],zmm12[4],zmm25[6],zmm12[6]
-; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm4 = zmm25[1],zmm12[1],zmm25[3],zmm12[3],zmm25[5],zmm12[5],zmm25[7],zmm12[7]
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm12, %zmm1, %zmm25
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm0
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm9, %zmm5, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm0
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm3, %zmm14, %zmm0
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm0
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm9, %zmm6, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm31[0],zmm3[0],zmm31[2],zmm3[2],zmm31[4],zmm3[4],zmm31[6],zmm3[6]
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm31[1],zmm3[1],zmm31[3],zmm3[3],zmm31[5],zmm3[5],zmm31[7],zmm3[7]
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm26
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm3, %zmm2, %zmm26
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm3, %zmm1, %zmm31
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm27, %zmm17
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm10, %zmm6, %zmm17
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm27, %zmm20
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm10, %zmm14, %zmm20
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm27, %zmm18
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm10, %zmm2, %zmm18
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm27[0],zmm10[0],zmm27[2],zmm10[2],zmm27[4],zmm10[4],zmm27[6],zmm10[6]
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm4 = zmm27[1],zmm10[1],zmm27[3],zmm10[3],zmm27[5],zmm10[5],zmm27[7],zmm10[7]
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm10, %zmm1, %zmm27
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm30, %zmm0
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm11, %zmm6, %zmm0
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm30 {%k1} = zmm28[0],zmm9[0],zmm28[2],zmm9[2],zmm28[4],zmm9[4],zmm28[6],zmm9[6]
-; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm28[1],zmm9[1],zmm28[3],zmm9[3],zmm28[5],zmm9[5],zmm28[7],zmm9[7]
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm12
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm9, %zmm2, %zmm12
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm9, %zmm1, %zmm28
-; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm8, %zmm17, %zmm5
-; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm8, %zmm17, %zmm6
-; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm8, %zmm17, %zmm2
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm9 = zmm17[0],zmm8[0],zmm17[2],zmm8[2],zmm17[4],zmm8[4],zmm17[6],zmm8[6]
-; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm0 = zmm17[1],zmm8[1],zmm17[3],zmm8[3],zmm17[5],zmm8[5],zmm17[7],zmm8[7]
-; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm8, %zmm1, %zmm17
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm30, %zmm0
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm11, %zmm14, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm10 {%k1} = zmm30[0],zmm11[0],zmm30[2],zmm11[2],zmm30[4],zmm11[4],zmm30[6],zmm11[6]
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm30[1],zmm11[1],zmm30[3],zmm11[3],zmm30[5],zmm11[5],zmm30[7],zmm11[7]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm30, %zmm16
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm11, %zmm2, %zmm16
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm11, %zmm1, %zmm30
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm12, %zmm19, %zmm6
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm12, %zmm19, %zmm14
+; AVX512DQ-SLOW-NEXT:    vpermi2q %zmm12, %zmm19, %zmm2
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm11 = zmm19[0],zmm12[0],zmm19[2],zmm12[2],zmm19[4],zmm12[4],zmm19[6],zmm12[6]
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm0 = zmm19[1],zmm12[1],zmm19[3],zmm12[3],zmm19[5],zmm12[5],zmm19[7],zmm12[7]
+; AVX512DQ-SLOW-NEXT:    vpermt2q %zmm12, %zmm1, %zmm19
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
 ; AVX512DQ-SLOW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
 ; AVX512DQ-SLOW-NEXT:    # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm3 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm3 {%k1}
 ; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm3, %zmm1
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
@@ -15893,19 +14440,20 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
 ; AVX512DQ-SLOW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
 ; AVX512DQ-SLOW-NEXT:    # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm31, %zmm1
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX512DQ-SLOW-NEXT:    # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm24, %zmm1
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm7, %zmm1
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512DQ-SLOW-NEXT:    # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vinsertf64x4 $0, %ymm1, %zmm3, %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
 ; AVX512DQ-SLOW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
 ; AVX512DQ-SLOW-NEXT:    # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm9 {%k1}
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm9, %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm11 {%k1}
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm11, %zmm1
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
 ; AVX512DQ-SLOW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
@@ -15917,338 +14465,308 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQ-SLOW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
 ; AVX512DQ-SLOW-NEXT:    # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm30, %zmm0
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm10, %zmm0
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQ-SLOW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
 ; AVX512DQ-SLOW-NEXT:    # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm27, %zmm0
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm5, %zmm0
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm21 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm29 {%k1}
 ; AVX512DQ-SLOW-NEXT:    vmovdqa (%rcx), %ymm0
 ; AVX512DQ-SLOW-NEXT:    vmovdqa (%rdx), %ymm1
 ; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
 ; AVX512DQ-SLOW-NEXT:    vmovdqa (%rsi), %ymm4
-; AVX512DQ-SLOW-NEXT:    vmovdqa (%rdi), %ymm8
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm9 = ymm8[0],ymm4[0],ymm8[2],ymm4[2]
-; AVX512DQ-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm9[2,3],ymm3[2,3]
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm21, %zmm3
+; AVX512DQ-SLOW-NEXT:    vmovdqa (%rdi), %ymm9
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm11 = ymm9[0],ymm4[0],ymm9[2],ymm4[2]
+; AVX512DQ-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm11[2,3],ymm3[2,3]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm29, %zmm3
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm9 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm5 {%k1}
 ; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
-; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm4[1],ymm8[3],ymm4[3]
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm4[1],ymm9[3],ymm4[3]
 ; AVX512DQ-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm9, %zmm0
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm5, %zmm0
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm19 {%k1}
-; AVX512DQ-SLOW-NEXT:    vmovdqa 64(%rsi), %ymm0
-; AVX512DQ-SLOW-NEXT:    vmovdqa 64(%rdi), %ymm1
-; AVX512DQ-SLOW-NEXT:    vmovdqa 64(%rcx), %ymm3
-; AVX512DQ-SLOW-NEXT:    vmovdqa 64(%rdx), %ymm4
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm8 = ymm4[0],ymm3[0],ymm4[2],ymm3[2]
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm9 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
-; AVX512DQ-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm8 = ymm9[2,3],ymm8[2,3]
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm8, %zmm19, %zmm8
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 (%rsp), %zmm9 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm9 {%k1}
-; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm4[1],ymm3[1],ymm4[3],ymm3[3]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm28 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqa 64(%rcx), %ymm0
+; AVX512DQ-SLOW-NEXT:    vmovdqa 64(%rdx), %ymm1
+; AVX512DQ-SLOW-NEXT:    vmovdqa 64(%rsi), %ymm4
+; AVX512DQ-SLOW-NEXT:    vmovdqa 64(%rdi), %ymm9
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm11 = ymm9[0],ymm4[0],ymm9[2],ymm4[2]
+; AVX512DQ-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm11[2,3],ymm3[2,3]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm28, %zmm3
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm5 {%k1}
 ; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
-; AVX512DQ-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3]
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm9, %zmm0
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm4[1],ymm9[3],ymm4[3]
+; AVX512DQ-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm5, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm14 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm21 {%k1}
 ; AVX512DQ-SLOW-NEXT:    vmovdqa 128(%rcx), %ymm0
 ; AVX512DQ-SLOW-NEXT:    vmovdqa 128(%rdx), %ymm1
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
-; AVX512DQ-SLOW-NEXT:    vmovdqa 128(%rsi), %ymm8
-; AVX512DQ-SLOW-NEXT:    vmovdqa 128(%rdi), %ymm9
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm4 = ymm9[0],ymm8[0],ymm9[2],ymm8[2]
-; AVX512DQ-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3]
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm14, %zmm3
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm4 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX512DQ-SLOW-NEXT:    vmovdqa 128(%rsi), %ymm9
+; AVX512DQ-SLOW-NEXT:    vmovdqa 128(%rdi), %ymm11
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm11[0],ymm9[0],ymm11[2],ymm9[2]
+; AVX512DQ-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm4[2,3]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm21, %zmm3
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 (%rsp), %zmm3 # 64-byte Reload
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
 ; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
-; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm8[1],ymm9[3],ymm8[3]
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm11[1],ymm9[1],ymm11[3],ymm9[3]
 ; AVX512DQ-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
 ; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm0
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm11 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm15 {%k1}
 ; AVX512DQ-SLOW-NEXT:    vmovdqa 192(%rcx), %ymm0
 ; AVX512DQ-SLOW-NEXT:    vmovdqa 192(%rdx), %ymm1
 ; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
 ; AVX512DQ-SLOW-NEXT:    vmovdqa 192(%rsi), %ymm4
-; AVX512DQ-SLOW-NEXT:    vmovdqa 192(%rdi), %ymm8
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm14 = ymm8[0],ymm4[0],ymm8[2],ymm4[2]
-; AVX512DQ-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm14[2,3],ymm3[2,3]
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm11, %zmm24
+; AVX512DQ-SLOW-NEXT:    vmovdqa 192(%rdi), %ymm9
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm11 = ymm9[0],ymm4[0],ymm9[2],ymm4[2]
+; AVX512DQ-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm11[2,3],ymm3[2,3]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm15, %zmm29
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm9 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm5 {%k1}
 ; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
-; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm4[1],ymm8[3],ymm4[3]
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm4[1],ymm9[3],ymm4[3]
 ; AVX512DQ-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm9, %zmm21
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm5, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm10 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm13 {%k1}
 ; AVX512DQ-SLOW-NEXT:    vmovdqa 256(%rcx), %ymm0
 ; AVX512DQ-SLOW-NEXT:    vmovdqa 256(%rdx), %ymm1
 ; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
 ; AVX512DQ-SLOW-NEXT:    vmovdqa 256(%rsi), %ymm4
-; AVX512DQ-SLOW-NEXT:    vmovdqa 256(%rdi), %ymm8
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm14 = ymm8[0],ymm4[0],ymm8[2],ymm4[2]
-; AVX512DQ-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm14[2,3],ymm3[2,3]
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm10, %zmm19
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm9 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqa 256(%rdi), %ymm11
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm15 = ymm11[0],ymm4[0],ymm11[2],ymm4[2]
+; AVX512DQ-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm15[2,3],ymm3[2,3]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm13, %zmm3
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm7 {%k1}
 ; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
-; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm4[1],ymm8[3],ymm4[3]
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm11[1],ymm4[1],ymm11[3],ymm4[3]
 ; AVX512DQ-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm9, %zmm10
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm7 {%k1}
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm7, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm8 {%k1}
 ; AVX512DQ-SLOW-NEXT:    vmovdqa 320(%rcx), %ymm1
-; AVX512DQ-SLOW-NEXT:    vmovdqa 320(%rdx), %ymm3
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm3[0],ymm1[0],ymm3[2],ymm1[2]
-; AVX512DQ-SLOW-NEXT:    vmovdqa 320(%rsi), %ymm4
-; AVX512DQ-SLOW-NEXT:    vmovdqa 320(%rdi), %ymm14
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm9 = ymm14[0],ymm4[0],ymm14[2],ymm4[2]
-; AVX512DQ-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm9[2,3],ymm0[2,3]
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm7, %zmm8
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm26, %zmm13 {%k1}
-; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3]
-; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm14[1],ymm4[1],ymm14[3],ymm4[3]
-; AVX512DQ-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3]
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm13, %zmm13
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm18 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqa 320(%rdx), %ymm4
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm11 = ymm4[0],ymm1[0],ymm4[2],ymm1[2]
+; AVX512DQ-SLOW-NEXT:    vmovdqa 320(%rsi), %ymm12
+; AVX512DQ-SLOW-NEXT:    vmovdqa 320(%rdi), %ymm15
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm9 = ymm15[0],ymm12[0],ymm15[2],ymm12[2]
+; AVX512DQ-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm9 = ymm9[2,3],ymm11[2,3]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm9, %zmm8, %zmm8
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm22 {%k1}
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm4[1],ymm1[1],ymm4[3],ymm1[3]
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm4 = ymm15[1],ymm12[1],ymm15[3],ymm12[3]
+; AVX512DQ-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[2,3]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm22, %zmm4
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm26, %zmm18 {%k1}
 ; AVX512DQ-SLOW-NEXT:    vmovdqa 384(%rcx), %ymm1
-; AVX512DQ-SLOW-NEXT:    vmovdqa 384(%rdx), %ymm3
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm4 = ymm3[0],ymm1[0],ymm3[2],ymm1[2]
-; AVX512DQ-SLOW-NEXT:    vmovdqa 384(%rsi), %ymm7
-; AVX512DQ-SLOW-NEXT:    vmovdqa 384(%rdi), %ymm9
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm14 = ymm9[0],ymm7[0],ymm9[2],ymm7[2]
-; AVX512DQ-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm4 = ymm14[2,3],ymm4[2,3]
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm18, %zmm18
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm29, %zmm25 {%k1}
-; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3]
-; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm9[1],ymm7[1],ymm9[3],ymm7[3]
-; AVX512DQ-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3]
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm25, %zmm25
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm2 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqa 384(%rdx), %ymm9
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm10 = ymm9[0],ymm1[0],ymm9[2],ymm1[2]
+; AVX512DQ-SLOW-NEXT:    vmovdqa 384(%rsi), %ymm11
+; AVX512DQ-SLOW-NEXT:    vmovdqa 384(%rdi), %ymm12
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm11[0],ymm12[2],ymm11[2]
+; AVX512DQ-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm10 = ymm13[2,3],ymm10[2,3]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm10, %zmm18, %zmm10
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm27 {%k1}
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm1[1],ymm9[3],ymm1[3]
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm9 = ymm12[1],ymm11[1],ymm12[3],ymm11[3]
+; AVX512DQ-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm9[2,3],ymm1[2,3]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm27, %zmm9
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm2 {%k1}
 ; AVX512DQ-SLOW-NEXT:    vmovdqa 448(%rcx), %ymm1
-; AVX512DQ-SLOW-NEXT:    vmovdqa 448(%rdx), %ymm3
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm4 = ymm3[0],ymm1[0],ymm3[2],ymm1[2]
-; AVX512DQ-SLOW-NEXT:    vmovdqa 448(%rsi), %ymm7
-; AVX512DQ-SLOW-NEXT:    vmovdqa 448(%rdi), %ymm9
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm12 = ymm9[0],ymm7[0],ymm9[2],ymm7[2]
-; AVX512DQ-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm4 = ymm12[2,3],ymm4[2,3]
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm2, %zmm4
-; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3]
-; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm9[1],ymm7[1],ymm9[3],ymm7[3]
+; AVX512DQ-SLOW-NEXT:    vmovdqa 448(%rdx), %ymm7
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm11 = ymm7[0],ymm1[0],ymm7[2],ymm1[2]
+; AVX512DQ-SLOW-NEXT:    vmovdqa 448(%rsi), %ymm12
+; AVX512DQ-SLOW-NEXT:    vmovdqa 448(%rdi), %ymm13
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm15 = ymm13[0],ymm12[0],ymm13[2],ymm12[2]
+; AVX512DQ-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm11 = ymm15[2,3],ymm11[2,3]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm11, %zmm2, %zmm11
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm1[1],ymm7[3],ymm1[3]
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm13[1],ymm12[1],ymm13[3],ymm12[3]
 ; AVX512DQ-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm17 {%k1}
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm17, %zmm9
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm11 {%k1}
-; AVX512DQ-SLOW-NEXT:    vmovdqa (%rcx), %xmm2
-; AVX512DQ-SLOW-NEXT:    vmovdqa (%rdx), %xmm3
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm2[0]
-; AVX512DQ-SLOW-NEXT:    vmovdqa (%rsi), %xmm7
-; AVX512DQ-SLOW-NEXT:    vmovdqa (%rdi), %xmm12
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm14 = xmm12[0],xmm7[0]
-; AVX512DQ-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm14, %ymm1
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm11, %zmm11
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm3[1],xmm2[1]
-; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm3 = xmm12[1],xmm7[1]
-; AVX512DQ-SLOW-NEXT:    vinserti128 $1, %xmm2, %ymm3, %ymm2
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm1, %zmm7
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512DQ-SLOW-NEXT:    vmovdqa 64(%rcx), %xmm2
-; AVX512DQ-SLOW-NEXT:    vmovdqa 64(%rdx), %xmm12
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm12[0],xmm2[0]
-; AVX512DQ-SLOW-NEXT:    vmovdqa 64(%rsi), %xmm14
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 64(%rdi), %xmm17
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm20 = xmm17[0],xmm14[0]
-; AVX512DQ-SLOW-NEXT:    vinserti32x4 $1, %xmm3, %ymm20, %ymm3
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm1, %zmm3
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm12[1],xmm2[1]
-; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm12 = xmm17[1],xmm14[1]
-; AVX512DQ-SLOW-NEXT:    vinserti128 $1, %xmm2, %ymm12, %ymm2
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm1, %zmm2
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm30, %zmm19 {%k1}
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm19, %zmm7
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512DQ-SLOW-NEXT:    vmovdqa 128(%rcx), %xmm12
-; AVX512DQ-SLOW-NEXT:    vmovdqa 128(%rdx), %xmm14
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm17 = xmm14[0],xmm12[0]
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 128(%rsi), %xmm20
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 128(%rdi), %xmm26
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm28 = xmm26[0],xmm20[0]
-; AVX512DQ-SLOW-NEXT:    vinserti32x4 $1, %xmm17, %ymm28, %ymm17
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm17, %zmm1, %zmm17
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm12 = xmm14[1],xmm12[1]
-; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm14 = xmm26[1],xmm20[1]
-; AVX512DQ-SLOW-NEXT:    vinserti128 $1, %xmm12, %ymm14, %ymm12
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm12, %zmm1, %zmm12
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512DQ-SLOW-NEXT:    vmovdqa 192(%rcx), %xmm14
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 192(%rdx), %xmm20
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm26 = xmm20[0],xmm14[0]
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 192(%rsi), %xmm28
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 192(%rdi), %xmm29
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm30 = xmm29[0],xmm28[0]
-; AVX512DQ-SLOW-NEXT:    vinserti32x4 $1, %xmm26, %ymm30, %ymm26
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm26, %zmm1, %zmm26
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm14 = xmm20[1],xmm14[1]
-; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm20 = xmm29[1],xmm28[1]
-; AVX512DQ-SLOW-NEXT:    vinserti32x4 $1, %xmm14, %ymm20, %ymm14
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm14, %zmm1, %zmm14
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 256(%rcx), %xmm20
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 256(%rdx), %xmm28
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm29 = xmm28[0],xmm20[0]
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 256(%rsi), %xmm30
-; AVX512DQ-SLOW-NEXT:    vmovdqa 256(%rdi), %xmm0
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm31 = xmm0[0],xmm30[0]
-; AVX512DQ-SLOW-NEXT:    vinserti32x4 $1, %xmm29, %ymm31, %ymm29
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm29, %zmm1, %zmm29
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm27 {%k1}
-; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm20 = xmm28[1],xmm20[1]
-; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm30[1]
-; AVX512DQ-SLOW-NEXT:    vinserti32x4 $1, %xmm20, %ymm0, %ymm0
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm27, %zmm1
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm22 {%k1}
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 320(%rcx), %xmm20
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 320(%rdx), %xmm28
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm30 = xmm28[0],xmm20[0]
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 320(%rsi), %xmm31
-; AVX512DQ-SLOW-NEXT:    vmovdqa 320(%rdi), %xmm0
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm27 = xmm0[0],xmm31[0]
-; AVX512DQ-SLOW-NEXT:    vinserti32x4 $1, %xmm30, %ymm27, %ymm27
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm27, %zmm22, %zmm22
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm13 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqa (%rsi), %xmm1
+; AVX512DQ-SLOW-NEXT:    vinserti128 $1, (%rcx), %ymm1, %ymm2
+; AVX512DQ-SLOW-NEXT:    vmovdqa (%rdi), %xmm1
+; AVX512DQ-SLOW-NEXT:    vinserti128 $1, (%rdx), %ymm1, %ymm12
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm12[0],ymm2[0],ymm12[2],ymm2[2]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm13, %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm15 {%k1}
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm12[1],ymm2[1],ymm12[3],ymm2[3]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm15, %zmm2
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm18 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqa 64(%rsi), %xmm12
+; AVX512DQ-SLOW-NEXT:    vinserti128 $1, 64(%rcx), %ymm12, %ymm12
+; AVX512DQ-SLOW-NEXT:    vmovdqa 64(%rdi), %xmm13
+; AVX512DQ-SLOW-NEXT:    vinserti128 $1, 64(%rdx), %ymm13, %ymm13
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm15 = ymm13[0],ymm12[0],ymm13[2],ymm12[2]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm15, %zmm18, %zmm15
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm19 {%k1}
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm12 = ymm13[1],ymm12[1],ymm13[3],ymm12[3]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm12, %zmm19, %zmm12
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm23 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqa 128(%rsi), %xmm13
+; AVX512DQ-SLOW-NEXT:    vinserti128 $1, 128(%rcx), %ymm13, %ymm13
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 128(%rdi), %xmm18
+; AVX512DQ-SLOW-NEXT:    vinserti32x4 $1, 128(%rdx), %ymm18, %ymm18
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm19 = ymm18[0],ymm13[0],ymm18[2],ymm13[2]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm19, %zmm23, %zmm19
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
 ; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm27, %zmm23 {%k1}
-; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm20 = xmm28[1],xmm20[1]
-; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm31[1]
-; AVX512DQ-SLOW-NEXT:    vinserti32x4 $1, %xmm20, %ymm0, %ymm0
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm23, %zmm0
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm15 {%k1}
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 384(%rcx), %xmm20
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 384(%rdx), %xmm23
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm27 = xmm23[0],xmm20[0]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm27 {%k1}
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm13 = ymm18[1],ymm13[1],ymm18[3],ymm13[3]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm13, %zmm27, %zmm13
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm5 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 192(%rsi), %xmm18
+; AVX512DQ-SLOW-NEXT:    vinserti32x4 $1, 192(%rcx), %ymm18, %ymm18
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 192(%rdi), %xmm23
+; AVX512DQ-SLOW-NEXT:    vinserti32x4 $1, 192(%rdx), %ymm23, %ymm23
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm27 = ymm23[0],ymm18[0],ymm23[2],ymm18[2]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm27, %zmm5, %zmm27
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm30, %zmm31 {%k1}
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm18 = ymm23[1],ymm18[1],ymm23[3],ymm18[3]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm18, %zmm31, %zmm18
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm16 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 256(%rsi), %xmm23
+; AVX512DQ-SLOW-NEXT:    vinserti32x4 $1, 256(%rcx), %ymm23, %ymm23
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 256(%rdi), %xmm30
+; AVX512DQ-SLOW-NEXT:    vinserti32x4 $1, 256(%rdx), %ymm30, %ymm30
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm31 = ymm30[0],ymm23[0],ymm30[2],ymm23[2]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm31, %zmm16, %zmm26
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm16 {%k1}
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm23 = ymm30[1],ymm23[1],ymm30[3],ymm23[3]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm23, %zmm16, %zmm23
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm24 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 320(%rsi), %xmm28
+; AVX512DQ-SLOW-NEXT:    vinserti32x4 $1, 320(%rcx), %ymm28, %ymm28
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 320(%rdi), %xmm30
+; AVX512DQ-SLOW-NEXT:    vinserti32x4 $1, 320(%rdx), %ymm30, %ymm30
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm31 = ymm30[0],ymm28[0],ymm30[2],ymm28[2]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm31, %zmm24, %zmm21
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm25 {%k1}
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm28 = ymm30[1],ymm28[1],ymm30[3],ymm28[3]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm28, %zmm25, %zmm22
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm17 {%k1}
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 384(%rsi), %xmm28
+; AVX512DQ-SLOW-NEXT:    vinserti32x4 $1, 384(%rcx), %ymm28, %ymm28
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 384(%rdi), %xmm30
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm31 = xmm30[0],xmm28[0]
-; AVX512DQ-SLOW-NEXT:    vinserti32x4 $1, %xmm27, %ymm31, %ymm27
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm27, %zmm15, %zmm15
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm27, %zmm16 {%k1}
-; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm20 = xmm23[1],xmm20[1]
-; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm23 = xmm30[1],xmm28[1]
-; AVX512DQ-SLOW-NEXT:    vinserti32x4 $1, %xmm20, %ymm23, %ymm20
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm20, %zmm16, %zmm16
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm5 {%k1}
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 448(%rcx), %xmm20
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 448(%rdx), %xmm23
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm27 = xmm23[0],xmm20[0]
+; AVX512DQ-SLOW-NEXT:    vinserti32x4 $1, 384(%rdx), %ymm30, %ymm30
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm31 = ymm30[0],ymm28[0],ymm30[2],ymm28[2]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm31, %zmm17, %zmm17
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm20 {%k1}
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm28 = ymm30[1],ymm28[1],ymm30[3],ymm28[3]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm28, %zmm20, %zmm16
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm6 {%k1}
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 448(%rsi), %xmm28
+; AVX512DQ-SLOW-NEXT:    vinserti32x4 $1, 448(%rcx), %ymm28, %ymm28
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 448(%rdi), %xmm30
-; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm31 = xmm30[0],xmm28[0]
-; AVX512DQ-SLOW-NEXT:    vinserti32x4 $1, %xmm27, %ymm31, %ymm27
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm27, %zmm5, %zmm5
-; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm27, %zmm6 {%k1}
-; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm20 = xmm23[1],xmm20[1]
-; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm23 = xmm30[1],xmm28[1]
-; AVX512DQ-SLOW-NEXT:    vinserti32x4 $1, %xmm20, %ymm23, %ymm20
-; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm20, %zmm6, %zmm6
+; AVX512DQ-SLOW-NEXT:    vinserti32x4 $1, 448(%rdx), %ymm30, %ymm30
+; AVX512DQ-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm31 = ymm30[0],ymm28[0],ymm30[2],ymm28[2]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm31, %zmm6, %zmm6
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm14 {%k1}
+; AVX512DQ-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm28 = ymm30[1],ymm28[1],ymm30[3],ymm28[3]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $0, %ymm28, %zmm14, %zmm5
 ; AVX512DQ-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm9, 3776(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm4, 3712(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm25, 3264(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm18, 3200(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm13, 2752(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm7, 3776(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm11, 3712(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm9, 3264(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm10, 3200(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm4, 2752(%rax)
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm8, 2688(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm10, 2240(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm19, 2176(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm21, 1728(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm24, 1664(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovaps %zmm4, 1216(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovaps %zmm4, 1152(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovups (%rsp), %zmm4 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovaps %zmm4, 704(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovaps %zmm4, 640(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovaps %zmm4, 192(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovaps %zmm4, 128(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovaps %zmm4, 4032(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovaps %zmm4, 3968(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovaps %zmm4, 3904(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovaps %zmm4, 3840(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm6, 3648(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm5, 3584(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovaps %zmm4, 3520(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovaps %zmm4, 3456(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovaps %zmm4, 3392(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovaps %zmm4, 3328(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, 2240(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, 2176(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovups (%rsp), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 1728(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm29, 1664(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 1216(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 1152(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 704(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 640(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 192(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 128(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 4032(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 3968(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 3904(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 3840(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm5, 3648(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm6, 3584(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 3520(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 3456(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 3392(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 3328(%rax)
 ; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm16, 3136(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm15, 3072(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovaps %zmm4, 3008(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovaps %zmm4, 2944(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovaps %zmm4, 2880(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT:    vmovaps %zmm4, 2816(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, 2624(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm22, 2560(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm17, 3072(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 3008(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 2944(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 2880(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 2816(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm22, 2624(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm21, 2560(%rax)
 ; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 2496(%rax)
 ; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
@@ -16257,8 +14775,8 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 2368(%rax)
 ; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 2304(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, 2112(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm29, 2048(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm23, 2112(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm26, 2048(%rax)
 ; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 1984(%rax)
 ; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
@@ -16267,8 +14785,8 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 1856(%rax)
 ; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 1792(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm14, 1600(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm26, 1536(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm18, 1600(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm27, 1536(%rax)
 ; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 1472(%rax)
 ; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
@@ -16277,8 +14795,8 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 1344(%rax)
 ; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 1280(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm12, 1088(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm17, 1024(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm13, 1088(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm19, 1024(%rax)
 ; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 960(%rax)
 ; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
@@ -16287,8 +14805,8 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 832(%rax)
 ; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 768(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm2, 576(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, 512(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm12, 576(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm15, 512(%rax)
 ; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 448(%rax)
 ; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
@@ -16297,608 +14815,621 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 320(%rax)
 ; AVX512DQ-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQ-SLOW-NEXT:    vmovaps %zmm0, 256(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm7, 64(%rax)
-; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm11, (%rax)
-; AVX512DQ-SLOW-NEXT:    addq $5384, %rsp # imm = 0x1508
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm2, 64(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, (%rax)
+; AVX512DQ-SLOW-NEXT:    addq $5576, %rsp # imm = 0x15C8
 ; AVX512DQ-SLOW-NEXT:    vzeroupper
 ; AVX512DQ-SLOW-NEXT:    retq
 ;
 ; AVX512DQ-FAST-LABEL: store_i64_stride8_vf64:
 ; AVX512DQ-FAST:       # %bb.0:
-; AVX512DQ-FAST-NEXT:    subq $5384, %rsp # imm = 0x1508
+; AVX512DQ-FAST-NEXT:    subq $5576, %rsp # imm = 0x15C8
 ; AVX512DQ-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512DQ-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX512DQ-FAST-NEXT:    vmovdqa64 128(%rdi), %zmm1
+; AVX512DQ-FAST-NEXT:    vmovaps 128(%rdi), %zmm0
+; AVX512DQ-FAST-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%rdi), %zmm4
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 (%rdi), %zmm8
-; AVX512DQ-FAST-NEXT:    vmovdqa64 128(%rsi), %zmm16
-; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%rsi), %zmm17
+; AVX512DQ-FAST-NEXT:    vmovdqa64 128(%rsi), %zmm23
+; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%rsi), %zmm25
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 (%rsi), %zmm3
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%rdx), %zmm6
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 (%rdx), %zmm9
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%rcx), %zmm7
-; AVX512DQ-FAST-NEXT:    vmovdqa64 (%rcx), %zmm11
-; AVX512DQ-FAST-NEXT:    vmovdqa64 (%r8), %zmm2
-; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%r8), %zmm24
-; AVX512DQ-FAST-NEXT:    vmovdqa64 128(%r8), %zmm20
-; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%r9), %zmm25
-; AVX512DQ-FAST-NEXT:    vmovdqa64 128(%r9), %zmm22
-; AVX512DQ-FAST-NEXT:    vmovdqa64 (%r9), %zmm18
-; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%r10), %zmm23
-; AVX512DQ-FAST-NEXT:    vmovdqa64 (%r10), %zmm21
-; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%rax), %zmm26
-; AVX512DQ-FAST-NEXT:    vmovdqa64 (%rax), %zmm19
+; AVX512DQ-FAST-NEXT:    vmovdqa64 (%rcx), %zmm10
+; AVX512DQ-FAST-NEXT:    vmovdqa64 (%r8), %zmm14
+; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%r8), %zmm19
+; AVX512DQ-FAST-NEXT:    vmovdqa64 128(%r8), %zmm16
+; AVX512DQ-FAST-NEXT:    vmovdqa64 (%r9), %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%r9), %zmm21
+; AVX512DQ-FAST-NEXT:    vmovdqa64 128(%r9), %zmm18
+; AVX512DQ-FAST-NEXT:    vmovdqa64 (%r10), %zmm20
+; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%r10), %zmm28
+; AVX512DQ-FAST-NEXT:    vmovdqa64 (%rax), %zmm26
+; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%rax), %zmm24
 ; AVX512DQ-FAST-NEXT:    movb $-64, %r11b
 ; AVX512DQ-FAST-NEXT:    kmovw %r11d, %k1
-; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm14 = [4,12,4,12,4,12,4,12]
-; AVX512DQ-FAST-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm21, %zmm0
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm19, %zmm14, %zmm0
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm10 = zmm2[0],zmm18[0],zmm2[2],zmm18[2],zmm2[4],zmm18[4],zmm2[6],zmm18[6]
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm10 {%k1}
+; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [4,12,4,12,4,12,4,12]
+; AVX512DQ-FAST-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm20, %zmm0
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm26, %zmm2, %zmm0
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm11 = zmm14[0],zmm1[0],zmm14[2],zmm1[2],zmm14[4],zmm1[4],zmm14[6],zmm1[6]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm11 {%k1}
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm8, %zmm0
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm3, %zmm14, %zmm0
-; AVX512DQ-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm15 = [4,12,4,12]
-; AVX512DQ-FAST-NEXT:    # ymm15 = mem[0,1,0,1]
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm9, %zmm13
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm11, %zmm15, %zmm13
-; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7]
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm10, %zmm0
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm3, %zmm2, %zmm0
+; AVX512DQ-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm13 = [4,12,4,12]
+; AVX512DQ-FAST-NEXT:    # ymm13 = mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm9, %zmm12
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm10, %zmm13, %zmm12
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm11, %zmm0
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,5,13,5,13,5,13]
-; AVX512DQ-FAST-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm21, %zmm0
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm19, %zmm12, %zmm0
-; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm10 = zmm2[1],zmm18[1],zmm2[3],zmm18[3],zmm2[5],zmm18[5],zmm2[7],zmm18[7]
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, %zmm5
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm10 {%k1}
+; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm5 = [5,13,5,13,5,13,5,13]
+; AVX512DQ-FAST-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm20, %zmm0
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm26, %zmm5, %zmm0
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm11 = zmm14[1],zmm1[1],zmm14[3],zmm1[3],zmm14[5],zmm1[5],zmm14[7],zmm1[7]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm14, %zmm15
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm11 {%k1}
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm8, %zmm0
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm3, %zmm12, %zmm0
-; AVX512DQ-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [5,13,5,13]
-; AVX512DQ-FAST-NEXT:    # ymm2 = mem[0,1,0,1]
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm9, %zmm13
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm11, %zmm2, %zmm13
-; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7]
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm10, %zmm0
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm3, %zmm5, %zmm0
+; AVX512DQ-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm14 = [5,13,5,13]
+; AVX512DQ-FAST-NEXT:    # ymm14 = mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm9, %zmm12
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm10, %zmm14, %zmm12
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm11, %zmm0
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,6,14,6,14,6,14]
-; AVX512DQ-FAST-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm5, %zmm0
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm5, %zmm27
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm18, %zmm13, %zmm0
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm21[0],zmm19[0],zmm21[2],zmm19[2],zmm21[4],zmm19[4],zmm21[6],zmm19[6]
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm8, %zmm10
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm3, %zmm13, %zmm10
-; AVX512DQ-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [6,14,6,14]
-; AVX512DQ-FAST-NEXT:    # ymm5 = mem[0,1,0,1]
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm9, %zmm13
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm11, %zmm5, %zmm13
-; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm13[4,5,6,7]
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm10, %zmm0, %zmm0
+; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm12 = [6,14,6,14,6,14,6,14]
+; AVX512DQ-FAST-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm15, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm15, %zmm22
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm1, %zmm17
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm1, %zmm12, %zmm0
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm20[0],zmm26[0],zmm20[2],zmm26[2],zmm20[4],zmm26[4],zmm20[6],zmm26[6]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm8, %zmm11
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm3, %zmm12, %zmm11
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm12, %zmm1
+; AVX512DQ-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm15 = [6,14,6,14]
+; AVX512DQ-FAST-NEXT:    # ymm15 = mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm9, %zmm12
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm10, %zmm15, %zmm12
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm11, %zmm0, %zmm0
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,7,15,7,15,7,15]
 ; AVX512DQ-FAST-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm27, %zmm10
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm18, %zmm0, %zmm10
-; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm10 {%k1} = zmm21[1],zmm19[1],zmm21[3],zmm19[3],zmm21[5],zmm19[5],zmm21[7],zmm19[7]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm22, %zmm11
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm17, %zmm0, %zmm11
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm11 {%k1} = zmm20[1],zmm26[1],zmm20[3],zmm26[3],zmm20[5],zmm26[5],zmm20[7],zmm26[7]
 ; AVX512DQ-FAST-NEXT:    vpermt2q %zmm3, %zmm0, %zmm8
 ; AVX512DQ-FAST-NEXT:    vbroadcasti64x2 {{.*#+}} ymm27 = [7,15,7,15]
 ; AVX512DQ-FAST-NEXT:    # ymm27 = mem[0,1,0,1]
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm11, %zmm27, %zmm9
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm10, %zmm27, %zmm9
 ; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm8, %zmm10, %zmm8
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm8, %zmm11, %zmm8
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm23, %zmm8
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm26, %zmm14, %zmm8
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm9 = zmm24[0],zmm25[0],zmm24[2],zmm25[2],zmm24[4],zmm25[4],zmm24[6],zmm25[6]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm28, %zmm8
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, %zmm12
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm24, %zmm2, %zmm8
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm9 = zmm19[0],zmm21[0],zmm19[2],zmm21[2],zmm19[4],zmm21[4],zmm19[6],zmm21[6]
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm8, %zmm9 {%k1}
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm4, %zmm8
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm17, %zmm14, %zmm8
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm25, %zmm2, %zmm8
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm6, %zmm10
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm7, %zmm15, %zmm10
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm7, %zmm13, %zmm10
 ; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7]
 ; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm8, %zmm9, %zmm8
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm23, %zmm8
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm26, %zmm12, %zmm8
-; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm9 = zmm24[1],zmm25[1],zmm24[3],zmm25[3],zmm24[5],zmm25[5],zmm24[7],zmm25[7]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm28, %zmm8
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm5, %zmm3
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm24, %zmm5, %zmm8
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm9 = zmm19[1],zmm21[1],zmm19[3],zmm21[3],zmm19[5],zmm21[5],zmm19[7],zmm21[7]
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm8, %zmm9 {%k1}
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm4, %zmm8
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm17, %zmm12, %zmm8
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm25, %zmm5, %zmm8
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm6, %zmm10
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm7, %zmm2, %zmm10
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm7, %zmm14, %zmm10
 ; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7]
 ; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm8, %zmm9, %zmm8
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm4, %zmm8
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm17, %zmm11, %zmm8
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm25, %zmm1, %zmm8
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm6, %zmm9
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm5, %zmm10
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm7, %zmm5, %zmm9
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm7, %zmm15, %zmm9
 ; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm24, %zmm5
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm24, (%rsp) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm24, %zmm9
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm25, %zmm11, %zmm9
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm23[0],zmm26[0],zmm23[2],zmm26[2],zmm23[4],zmm26[4],zmm23[6],zmm26[6]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm19, %zmm5
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm19, %zmm9
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm21, %zmm1, %zmm9
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm24, %zmm10
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm28[0],zmm24[0],zmm28[2],zmm24[2],zmm28[4],zmm24[4],zmm28[6],zmm24[6]
 ; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm8, %zmm9, %zmm8
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 128(%r10), %zmm9
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm17, %zmm0, %zmm4
-; AVX512DQ-FAST-NEXT:    vmovdqa64 128(%rax), %zmm18
+; AVX512DQ-FAST-NEXT:    vmovdqa64 128(%r10), %zmm2
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm25, %zmm0, %zmm4
+; AVX512DQ-FAST-NEXT:    vmovdqa64 128(%rax), %zmm24
 ; AVX512DQ-FAST-NEXT:    vpermt2q %zmm7, %zmm27, %zmm6
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm25, %zmm0, %zmm5
-; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm23[1],zmm26[1],zmm23[3],zmm26[3],zmm23[5],zmm26[5],zmm23[7],zmm26[7]
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm21, %zmm0, %zmm5
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm28[1],zmm10[1],zmm28[3],zmm10[3],zmm28[5],zmm10[5],zmm28[7],zmm10[7]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm10, %zmm21
 ; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7]
 ; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm5, %zmm4
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm9, %zmm4
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm18, %zmm14, %zmm4
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm5 = zmm20[0],zmm22[0],zmm20[2],zmm22[2],zmm20[4],zmm22[4],zmm20[6],zmm22[6]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, %zmm4
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm24, %zmm12, %zmm4
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm5 = zmm16[0],zmm18[0],zmm16[2],zmm18[2],zmm16[4],zmm18[4],zmm16[6],zmm18[6]
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm4, %zmm5 {%k1}
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm1, %zmm6
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm16, %zmm14, %zmm6
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm9, %zmm6
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm23, %zmm12, %zmm6
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 128(%rdx), %zmm4
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 128(%rcx), %zmm7
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm4, %zmm8
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm7, %zmm15, %zmm8
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm7, %zmm13, %zmm8
 ; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
 ; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm6, %zmm5, %zmm5
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm9, %zmm5
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm18, %zmm12, %zmm5
-; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm6 = zmm20[1],zmm22[1],zmm20[3],zmm22[3],zmm20[5],zmm22[5],zmm20[7],zmm22[7]
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, %zmm5
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm24, %zmm3, %zmm5
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm6 = zmm16[1],zmm18[1],zmm16[3],zmm18[3],zmm16[5],zmm18[5],zmm16[7],zmm18[7]
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm5, %zmm6 {%k1}
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm1, %zmm5
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm16, %zmm12, %zmm5
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm9, %zmm5
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm23, %zmm3, %zmm5
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, %zmm10
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm4, %zmm8
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm7, %zmm2, %zmm8
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, %zmm13
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm7, %zmm14, %zmm8
 ; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7]
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm5, %zmm6, %zmm2
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm1, %zmm5
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm16, %zmm11, %zmm5
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm5, %zmm6, %zmm5
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm9, %zmm5
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm1, %zmm11
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm23, %zmm1, %zmm5
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm4, %zmm6
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm7, %zmm10, %zmm6
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm10, %zmm3
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm7, %zmm15, %zmm6
 ; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7]
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm20, %zmm6
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm22, %zmm11, %zmm6
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm18[0],zmm9[2],zmm18[2],zmm9[4],zmm18[4],zmm9[6],zmm18[6]
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm5, %zmm6, %zmm2
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm16, %zmm0, %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm16, %zmm6
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm18, %zmm1, %zmm6
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, %zmm3
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm2, (%rsp) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm2[0],zmm24[0],zmm2[2],zmm24[2],zmm2[4],zmm24[4],zmm2[6],zmm24[6]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm5, %zmm6, %zmm5
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm23, %zmm0, %zmm9
 ; AVX512DQ-FAST-NEXT:    vpermt2q %zmm7, %zmm27, %zmm4
-; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7]
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm20, %zmm2
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm22, %zmm0, %zmm2
-; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm18[1],zmm9[3],zmm18[3],zmm9[5],zmm18[5],zmm9[7],zmm18[7]
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm16, %zmm2
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm18, %zmm0, %zmm2
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm3[1],zmm24[1],zmm3[3],zmm24[3],zmm3[5],zmm24[5],zmm3[7],zmm24[7]
 ; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm2, %zmm1
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 192(%r10), %zmm9
-; AVX512DQ-FAST-NEXT:    vmovdqa64 192(%rax), %zmm20
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm9, %zmm1
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm20, %zmm14, %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqa64 192(%r10), %zmm3
+; AVX512DQ-FAST-NEXT:    vmovdqa64 192(%rax), %zmm17
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, %zmm1
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm17, %zmm12, %zmm1
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 192(%r8), %zmm2
-; AVX512DQ-FAST-NEXT:    vmovdqa64 192(%r9), %zmm10
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm4 = zmm2[0],zmm10[0],zmm2[2],zmm10[2],zmm2[4],zmm10[4],zmm2[6],zmm10[6]
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, %zmm16
+; AVX512DQ-FAST-NEXT:    vmovdqa64 192(%r9), %zmm5
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm4 = zmm2[0],zmm5[0],zmm2[2],zmm5[2],zmm2[4],zmm5[4],zmm2[6],zmm5[6]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm5, %zmm16
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, %zmm9
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 192(%rdi), %zmm1
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 192(%rsi), %zmm5
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm1, %zmm6
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm5, %zmm14, %zmm6
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm5, %zmm12, %zmm6
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 192(%rdx), %zmm2
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 192(%rcx), %zmm7
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, %zmm8
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm7, %zmm15, %zmm8
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm7, %zmm13, %zmm8
 ; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
 ; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm6, %zmm4, %zmm4
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm9, %zmm4
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm20, %zmm12, %zmm4
-; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm6 = zmm16[1],zmm10[1],zmm16[3],zmm10[3],zmm16[5],zmm10[5],zmm16[7],zmm10[7]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, %zmm4
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm17, %zmm10, %zmm4
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm6 = zmm9[1],zmm16[1],zmm9[3],zmm16[3],zmm9[5],zmm16[5],zmm9[7],zmm16[7]
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm4, %zmm6 {%k1}
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm1, %zmm4
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm5, %zmm12, %zmm4
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm5, %zmm10, %zmm4
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, %zmm8
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm7, %zmm13, %zmm8
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm7, %zmm14, %zmm8
 ; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
 ; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm6, %zmm4
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm1, %zmm4
 ; AVX512DQ-FAST-NEXT:    vpermt2q %zmm5, %zmm11, %zmm4
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, %zmm6
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm7, %zmm3, %zmm6
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm7, %zmm15, %zmm6
 ; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7]
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm16, %zmm6
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm10, %zmm11, %zmm6
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm20[0],zmm9[2],zmm20[2],zmm9[4],zmm20[4],zmm9[6],zmm20[6]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm9, %zmm6
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm16, %zmm11, %zmm6
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm3[0],zmm17[0],zmm3[2],zmm17[2],zmm3[4],zmm17[4],zmm3[6],zmm17[6]
 ; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm6, %zmm4
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FAST-NEXT:    vpermt2q %zmm5, %zmm0, %zmm1
 ; AVX512DQ-FAST-NEXT:    vpermt2q %zmm7, %zmm27, %zmm2
 ; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm16, %zmm2
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm10, %zmm0, %zmm2
-; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm20[1],zmm9[3],zmm20[3],zmm9[5],zmm20[5],zmm9[7],zmm20[7]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm9, %zmm2
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm16, %zmm0, %zmm2
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm3[1],zmm17[1],zmm3[3],zmm17[3],zmm3[5],zmm17[5],zmm3[7],zmm17[7]
 ; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm2, %zmm1
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 256(%r10), %zmm9
-; AVX512DQ-FAST-NEXT:    vmovdqa64 256(%rax), %zmm22
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm9, %zmm1
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm22, %zmm14, %zmm1
-; AVX512DQ-FAST-NEXT:    vmovdqa64 256(%r8), %zmm5
-; AVX512DQ-FAST-NEXT:    vmovdqa64 256(%r9), %zmm2
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm4 = zmm5[0],zmm2[0],zmm5[2],zmm2[2],zmm5[4],zmm2[4],zmm5[6],zmm2[6]
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm5, %zmm17
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, %zmm16
+; AVX512DQ-FAST-NEXT:    vmovdqa64 256(%r10), %zmm3
+; AVX512DQ-FAST-NEXT:    vmovdqa64 256(%rax), %zmm25
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, %zmm1
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm25, %zmm12, %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqa64 256(%r8), %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqa64 256(%r9), %zmm18
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm4 = zmm2[0],zmm18[0],zmm2[2],zmm18[2],zmm2[4],zmm18[4],zmm2[6],zmm18[6]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, %zmm9
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 256(%rdi), %zmm1
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 256(%rsi), %zmm5
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm1, %zmm6
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm5, %zmm14, %zmm6
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm5, %zmm12, %zmm6
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 256(%rdx), %zmm2
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 256(%rcx), %zmm7
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, %zmm8
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm7, %zmm15, %zmm8
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm7, %zmm13, %zmm8
 ; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
 ; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm6, %zmm4, %zmm4
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm9, %zmm4
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm22, %zmm12, %zmm4
-; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm6 = zmm17[1],zmm16[1],zmm17[3],zmm16[3],zmm17[5],zmm16[5],zmm17[7],zmm16[7]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, %zmm4
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm25, %zmm10, %zmm4
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm6 = zmm9[1],zmm18[1],zmm9[3],zmm18[3],zmm9[5],zmm18[5],zmm9[7],zmm18[7]
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm4, %zmm6 {%k1}
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm1, %zmm4
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm5, %zmm12, %zmm4
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm5, %zmm10, %zmm4
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, %zmm8
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm7, %zmm13, %zmm8
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm7, %zmm14, %zmm8
 ; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
 ; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm6, %zmm4
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm1, %zmm4
 ; AVX512DQ-FAST-NEXT:    vpermt2q %zmm5, %zmm11, %zmm4
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, %zmm6
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm7, %zmm3, %zmm6
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm7, %zmm15, %zmm6
 ; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7]
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm17, %zmm6
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm16, %zmm11, %zmm6
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm22[0],zmm9[2],zmm22[2],zmm9[4],zmm22[4],zmm9[6],zmm22[6]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm9, %zmm6
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm18, %zmm11, %zmm6
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm11, %zmm10
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm3[0],zmm25[0],zmm3[2],zmm25[2],zmm3[4],zmm25[4],zmm3[6],zmm25[6]
 ; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm6, %zmm4
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FAST-NEXT:    vpermt2q %zmm5, %zmm0, %zmm1
 ; AVX512DQ-FAST-NEXT:    vpermt2q %zmm7, %zmm27, %zmm2
 ; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm17, %zmm2
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm16, %zmm0, %zmm2
-; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm22[1],zmm9[3],zmm22[3],zmm9[5],zmm22[5],zmm9[7],zmm22[7]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm9, %zmm2
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm18, %zmm0, %zmm2
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm3[1],zmm25[1],zmm3[3],zmm25[3],zmm3[5],zmm25[5],zmm3[7],zmm25[7]
 ; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm2, %zmm1
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 320(%rdi), %zmm1
-; AVX512DQ-FAST-NEXT:    vmovdqa64 320(%rsi), %zmm17
+; AVX512DQ-FAST-NEXT:    vmovdqa64 320(%rsi), %zmm2
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm1, %zmm4
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm17, %zmm14, %zmm4
-; AVX512DQ-FAST-NEXT:    vmovdqa64 320(%rdx), %zmm2
-; AVX512DQ-FAST-NEXT:    vmovdqa64 320(%rcx), %zmm9
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, %zmm5
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm9, %zmm15, %zmm5
-; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7]
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %ymm4, %ymm24
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, %zmm6
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm9, %zmm13, %zmm6
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, %zmm7
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm9, %zmm3, %zmm7
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, %zmm5
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm9, %zmm27, %zmm2
-; AVX512DQ-FAST-NEXT:    vmovdqa64 384(%rdx), %zmm8
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm2, %zmm12, %zmm4
+; AVX512DQ-FAST-NEXT:    vmovdqa64 320(%rdx), %zmm7
+; AVX512DQ-FAST-NEXT:    vmovdqa64 320(%rcx), %zmm5
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm7, %zmm6
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm5, %zmm13, %zmm6
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm6 = ymm4[0,1,2,3],ymm6[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm7, %zmm11
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm5, %zmm14, %zmm11
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm7, %zmm8
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm5, %zmm15, %zmm8
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm5, %zmm27, %zmm7
+; AVX512DQ-FAST-NEXT:    vmovdqa64 384(%rdx), %zmm3
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 384(%rcx), %zmm4
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm8, %zmm9
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm4, %zmm15, %zmm9
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm8, %zmm9
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm4, %zmm13, %zmm9
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm8, %zmm3
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm4, %zmm5, %zmm3
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, %zmm5
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm4, %zmm13, %zmm5
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, %zmm5
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm4, %zmm14, %zmm5
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, %zmm5
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm4, %zmm15, %zmm5
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm4, %zmm27, %zmm3
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm4, %zmm27, %zmm8
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 448(%rdx), %zmm3
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 448(%rcx), %zmm4
-; AVX512DQ-FAST-NEXT:    vpermi2q %zmm4, %zmm3, %zmm15
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FAST-NEXT:    vpermi2q %zmm4, %zmm3, %zmm13
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vpermi2q %zmm4, %zmm3, %zmm5
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm4, %zmm3, %zmm14
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm4, %zmm3, %zmm15
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FAST-NEXT:    vpermt2q %zmm4, %zmm27, %zmm3
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 320(%r10), %zmm26
-; AVX512DQ-FAST-NEXT:    vmovdqa64 320(%rax), %zmm16
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm26, %zmm3
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm16, %zmm14, %zmm3
-; AVX512DQ-FAST-NEXT:    vmovdqa64 320(%r8), %zmm13
-; AVX512DQ-FAST-NEXT:    vmovdqa64 320(%r9), %zmm15
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm9 = zmm13[0],zmm15[0],zmm13[2],zmm15[2],zmm13[4],zmm15[4],zmm13[6],zmm15[6]
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, %zmm9 {%k1}
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm24, %zmm9, %zmm3
+; AVX512DQ-FAST-NEXT:    vmovdqa64 320(%r10), %zmm23
+; AVX512DQ-FAST-NEXT:    vmovdqa64 320(%rax), %zmm9
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm23, %zmm3
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm9, %zmm12, %zmm3
+; AVX512DQ-FAST-NEXT:    vmovdqa64 320(%r8), %zmm22
+; AVX512DQ-FAST-NEXT:    vmovdqa64 320(%r9), %zmm16
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm15 = zmm22[0],zmm16[0],zmm22[2],zmm16[2],zmm22[4],zmm16[4],zmm22[6],zmm16[6]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, %zmm15 {%k1}
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm6, %zmm15, %zmm3
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm26, %zmm3
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm16, %zmm12, %zmm3
-; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm5 = zmm13[1],zmm15[1],zmm13[3],zmm15[3],zmm13[5],zmm15[5],zmm13[7],zmm15[7]
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, %zmm5 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm23, %zmm3
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm9, %zmm4, %zmm3
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm6 = zmm22[1],zmm16[1],zmm22[3],zmm16[3],zmm22[5],zmm16[5],zmm22[7],zmm16[7]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, %zmm6 {%k1}
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm1, %zmm3
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm17, %zmm12, %zmm3
-; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7]
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm5, %zmm3
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm2, %zmm4, %zmm3
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm6, %zmm3
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm1, %zmm3
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm17, %zmm11, %zmm3
-; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5,6,7]
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm13, %zmm5
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm15, %zmm11, %zmm5
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm26[0],zmm16[0],zmm26[2],zmm16[2],zmm26[4],zmm16[4],zmm26[6],zmm16[6]
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm5, %zmm3
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm10, %zmm5
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm2, %zmm10, %zmm3
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm22, %zmm6
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm16, %zmm10, %zmm6
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm23[0],zmm9[0],zmm23[2],zmm9[2],zmm23[4],zmm9[4],zmm23[6],zmm9[6]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm6, %zmm3
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm17, %zmm0, %zmm1
-; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm13, %zmm2
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm15, %zmm0, %zmm2
-; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm26[1],zmm16[1],zmm26[3],zmm16[3],zmm26[5],zmm16[5],zmm26[7],zmm16[7]
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm2, %zmm0, %zmm1
+; AVX512DQ-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm22, %zmm2
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm16, %zmm0, %zmm2
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm23[1],zmm9[1],zmm23[3],zmm9[3],zmm23[5],zmm9[5],zmm23[7],zmm9[7]
 ; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm2, %zmm1
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 384(%rdi), %zmm3
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 384(%rsi), %zmm1
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, %zmm4
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm1, %zmm14, %zmm4
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, %zmm5
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm12, %zmm4
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm1, %zmm12, %zmm5
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, %zmm5
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm1, %zmm11, %zmm5
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, %zmm6
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm1, %zmm12, %zmm6
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, %zmm6
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm1, %zmm4, %zmm6
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, %zmm6
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm1, %zmm10, %zmm6
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FAST-NEXT:    vpermt2q %zmm1, %zmm0, %zmm3
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 384(%r10), %zmm29
+; AVX512DQ-FAST-NEXT:    vmovdqa64 384(%r10), %zmm31
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 384(%rax), %zmm3
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm29, %zmm1
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm3, %zmm14, %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm31, %zmm1
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm3, %zmm12, %zmm1
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm29, %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm31, %zmm1
 ; AVX512DQ-FAST-NEXT:    vpermt2q %zmm3, %zmm4, %zmm1
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 384(%r8), %zmm25
-; AVX512DQ-FAST-NEXT:    vmovdqa64 384(%r9), %zmm12
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm25, %zmm31
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm12, %zmm11, %zmm31
-; AVX512DQ-FAST-NEXT:    vmovdqa64 448(%r10), %zmm28
-; AVX512DQ-FAST-NEXT:    vmovdqa64 448(%rax), %zmm9
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm28, %zmm1
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm9, %zmm14, %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqa64 384(%r8), %zmm27
+; AVX512DQ-FAST-NEXT:    vmovdqa64 384(%r9), %zmm10
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm27, %zmm1
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm10, %zmm5, %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 448(%r10), %zmm30
+; AVX512DQ-FAST-NEXT:    vmovdqa64 448(%rax), %zmm11
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm30, %zmm1
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm11, %zmm12, %zmm1
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 448(%rdi), %zmm5
+; AVX512DQ-FAST-NEXT:    vmovdqa64 448(%rdi), %zmm6
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 448(%rsi), %zmm1
-; AVX512DQ-FAST-NEXT:    vpermi2q %zmm1, %zmm5, %zmm14
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm28, %zmm2
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm9, %zmm4, %zmm2
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm1, %zmm6, %zmm12
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm30, %zmm2
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm11, %zmm4, %zmm2
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vpermi2q %zmm1, %zmm5, %zmm4
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm1, %zmm6, %zmm4
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 448(%r8), %zmm17
-; AVX512DQ-FAST-NEXT:    vmovdqa64 448(%r9), %zmm8
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm17, %zmm30
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm8, %zmm11, %zmm30
-; AVX512DQ-FAST-NEXT:    vpermi2q %zmm1, %zmm5, %zmm11
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm1, %zmm0, %zmm5
+; AVX512DQ-FAST-NEXT:    vmovdqa64 448(%r8), %zmm19
+; AVX512DQ-FAST-NEXT:    vmovdqa64 448(%r9), %zmm12
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm19, %zmm2
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm12, %zmm5, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm1, %zmm6, %zmm5
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm25, %zmm24
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm12, %zmm0, %zmm24
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm17, %zmm27
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm8, %zmm0, %zmm27
-; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm5 = [0,8,0,8,0,8,0,8]
-; AVX512DQ-FAST-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm21, %zmm0
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm19, %zmm5, %zmm0
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm6 = [1,9,1,9,1,9,1,9]
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm1, %zmm0, %zmm6
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm27, %zmm1
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm10, %zmm0, %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm1, %zmm5
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm19, %zmm1
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm12, %zmm0, %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm6 = [0,8,0,8,0,8,0,8]
 ; AVX512DQ-FAST-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm21, %zmm0
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm19, %zmm6, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm20, %zmm0
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm26, %zmm6, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm14 = [1,9,1,9,1,9,1,9]
+; AVX512DQ-FAST-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm20, %zmm0
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm26, %zmm14, %zmm0
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10]
 ; AVX512DQ-FAST-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm21, %zmm0
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm19, %zmm2, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm20, %zmm0
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm26, %zmm2, %zmm0
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [3,11,3,11,3,11,3,11]
 ; AVX512DQ-FAST-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm19, %zmm1, %zmm21
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm26, %zmm1, %zmm20
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm7
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm8
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm4, %zmm5, %zmm7
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm7
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm4, %zmm6, %zmm7
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm21
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm4, %zmm2, %zmm21
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm4, %zmm6, %zmm8
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm8
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm4, %zmm14, %zmm8
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm29
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm4, %zmm2, %zmm29
 ; AVX512DQ-FAST-NEXT:    vpermt2q %zmm4, %zmm1, %zmm0
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm23, %zmm4
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm28, %zmm0
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm21, %zmm6, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm28, %zmm0
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm21, %zmm14, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm28, %zmm0
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm21, %zmm2, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm21, %zmm1, %zmm28
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm0, %zmm5, %zmm4
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm23, %zmm4
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm0, %zmm6, %zmm4
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm26, %zmm6, %zmm4
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm23, %zmm4
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm0, %zmm2, %zmm4
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm26, %zmm14, %zmm4
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm0, %zmm1, %zmm23
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm28
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm26, %zmm2, %zmm28
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm26, %zmm1, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm7
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm4, %zmm5, %zmm7
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm7
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm4, %zmm6, %zmm7
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm19
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm4, %zmm2, %zmm19
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm4, %zmm1, %zmm0
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm4
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm18, %zmm5, %zmm4
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm24, %zmm6, %zmm4
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm4
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm18, %zmm6, %zmm4
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm24, %zmm14, %zmm4
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm4
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm18, %zmm2, %zmm4
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm24, %zmm2, %zmm4
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm18, %zmm1, %zmm0
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm24, %zmm1, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm7, %zmm4
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm7
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm4, %zmm5, %zmm7
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm7
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm4, %zmm6, %zmm7
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm0, %zmm6, %zmm4
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm7, %zmm4
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm0, %zmm14, %zmm4
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm7, %zmm21
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm0, %zmm2, %zmm21
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm0, %zmm1, %zmm7
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm14
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm4, %zmm2, %zmm14
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm4, %zmm1, %zmm0
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm4
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm20, %zmm5, %zmm4
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm17, %zmm6, %zmm4
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm4
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm20, %zmm6, %zmm4
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm17, %zmm14, %zmm4
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm4
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm20, %zmm2, %zmm4
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm17, %zmm2, %zmm4
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm20, %zmm1, %zmm0
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm17, %zmm1, %zmm0
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm4
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm10, %zmm5, %zmm4
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm7, %zmm6, %zmm4
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm4
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm10, %zmm6, %zmm4
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm7, %zmm14, %zmm4
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm11
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm10, %zmm2, %zmm11
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm10, %zmm1, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm15
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm7, %zmm2, %zmm15
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm7, %zmm1, %zmm0
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm4
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm22, %zmm5, %zmm4
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm25, %zmm6, %zmm4
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm4
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm22, %zmm6, %zmm4
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm25, %zmm14, %zmm4
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm4
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm22, %zmm2, %zmm4
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm25, %zmm2, %zmm4
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm22, %zmm1, %zmm0
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm25, %zmm1, %zmm0
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm7
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm4, %zmm5, %zmm7
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm7
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm4, %zmm6, %zmm7
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm10
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm4, %zmm2, %zmm10
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm4, %zmm1, %zmm0
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm26, %zmm0
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm16, %zmm5, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm18, %zmm6, %zmm4
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm18, %zmm14, %zmm4
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm13
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm18, %zmm2, %zmm13
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm18, %zmm1, %zmm0
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm26, %zmm0
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm16, %zmm6, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm23, %zmm0
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm9, %zmm6, %zmm0
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm26, %zmm0
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm16, %zmm2, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm23, %zmm0
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm9, %zmm14, %zmm0
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm16, %zmm1, %zmm26
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm13, %zmm22
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm15, %zmm5, %zmm22
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm13, %zmm23
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm15, %zmm6, %zmm23
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm13, %zmm7
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm15, %zmm2, %zmm7
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm15, %zmm1, %zmm13
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm29, %zmm0
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm3, %zmm5, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm23, %zmm0
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm9, %zmm2, %zmm0
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm29, %zmm0
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm9, %zmm1, %zmm23
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm22, %zmm24
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm16, %zmm6, %zmm24
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm22, %zmm25
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm16, %zmm14, %zmm25
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm22, %zmm8
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm16, %zmm2, %zmm8
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm16, %zmm1, %zmm22
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm31, %zmm0
 ; AVX512DQ-FAST-NEXT:    vpermt2q %zmm3, %zmm6, %zmm0
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm31 {%k1} = zmm29[0],zmm3[0],zmm29[2],zmm3[2],zmm29[4],zmm3[4],zmm29[6],zmm3[6]
-; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm24 {%k1} = zmm29[1],zmm3[1],zmm29[3],zmm3[3],zmm29[5],zmm3[5],zmm29[7],zmm3[7]
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm29, %zmm20
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm3, %zmm2, %zmm20
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm3, %zmm1, %zmm29
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm25, %zmm15
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm12, %zmm5, %zmm15
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm25, %zmm16
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm12, %zmm6, %zmm16
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm25, %zmm18
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm12, %zmm2, %zmm18
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm25[0],zmm12[0],zmm25[2],zmm12[2],zmm25[4],zmm12[4],zmm25[6],zmm12[6]
-; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm4 = zmm25[1],zmm12[1],zmm25[3],zmm12[3],zmm25[5],zmm12[5],zmm25[7],zmm12[7]
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm12, %zmm1, %zmm25
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm28, %zmm0
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm9, %zmm5, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm31, %zmm0
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm3, %zmm14, %zmm0
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm28, %zmm0
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm9, %zmm6, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm31[0],zmm3[0],zmm31[2],zmm3[2],zmm31[4],zmm3[4],zmm31[6],zmm3[6]
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm31[1],zmm3[1],zmm31[3],zmm3[3],zmm31[5],zmm3[5],zmm31[7],zmm3[7]
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm31, %zmm26
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm3, %zmm2, %zmm26
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm3, %zmm1, %zmm31
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm27, %zmm17
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm10, %zmm6, %zmm17
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm27, %zmm20
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm10, %zmm14, %zmm20
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm27, %zmm18
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm10, %zmm2, %zmm18
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm27[0],zmm10[0],zmm27[2],zmm10[2],zmm27[4],zmm10[4],zmm27[6],zmm10[6]
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm4 = zmm27[1],zmm10[1],zmm27[3],zmm10[3],zmm27[5],zmm10[5],zmm27[7],zmm10[7]
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm10, %zmm1, %zmm27
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm30, %zmm0
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm11, %zmm6, %zmm0
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm30 {%k1} = zmm28[0],zmm9[0],zmm28[2],zmm9[2],zmm28[4],zmm9[4],zmm28[6],zmm9[6]
-; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm28[1],zmm9[1],zmm28[3],zmm9[3],zmm28[5],zmm9[5],zmm28[7],zmm9[7]
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm28, %zmm12
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm9, %zmm2, %zmm12
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm9, %zmm1, %zmm28
-; AVX512DQ-FAST-NEXT:    vpermi2q %zmm8, %zmm17, %zmm5
-; AVX512DQ-FAST-NEXT:    vpermi2q %zmm8, %zmm17, %zmm6
-; AVX512DQ-FAST-NEXT:    vpermi2q %zmm8, %zmm17, %zmm2
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm9 = zmm17[0],zmm8[0],zmm17[2],zmm8[2],zmm17[4],zmm8[4],zmm17[6],zmm8[6]
-; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm0 = zmm17[1],zmm8[1],zmm17[3],zmm8[3],zmm17[5],zmm8[5],zmm17[7],zmm8[7]
-; AVX512DQ-FAST-NEXT:    vpermt2q %zmm8, %zmm1, %zmm17
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm30, %zmm0
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm11, %zmm14, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm10 {%k1} = zmm30[0],zmm11[0],zmm30[2],zmm11[2],zmm30[4],zmm11[4],zmm30[6],zmm11[6]
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm30[1],zmm11[1],zmm30[3],zmm11[3],zmm30[5],zmm11[5],zmm30[7],zmm11[7]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm30, %zmm16
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm11, %zmm2, %zmm16
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm11, %zmm1, %zmm30
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm12, %zmm19, %zmm6
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm12, %zmm19, %zmm14
+; AVX512DQ-FAST-NEXT:    vpermi2q %zmm12, %zmm19, %zmm2
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm11 = zmm19[0],zmm12[0],zmm19[2],zmm12[2],zmm19[4],zmm12[4],zmm19[6],zmm12[6]
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm0 = zmm19[1],zmm12[1],zmm19[3],zmm12[3],zmm19[5],zmm12[5],zmm19[7],zmm12[7]
+; AVX512DQ-FAST-NEXT:    vpermt2q %zmm12, %zmm1, %zmm19
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
 ; AVX512DQ-FAST-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
 ; AVX512DQ-FAST-NEXT:    # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm8, %zmm3 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm9, %zmm3 {%k1}
 ; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm3, %zmm1
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
@@ -16911,19 +15442,20 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
 ; AVX512DQ-FAST-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
 ; AVX512DQ-FAST-NEXT:    # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm31, %zmm1
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm7, %zmm1
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
 ; AVX512DQ-FAST-NEXT:    # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm24, %zmm1
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vinsertf64x4 $0, %ymm1, %zmm3, %zmm1
+; AVX512DQ-FAST-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
 ; AVX512DQ-FAST-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
 ; AVX512DQ-FAST-NEXT:    # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, %zmm9 {%k1}
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm9, %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, %zmm11 {%k1}
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm11, %zmm1
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
 ; AVX512DQ-FAST-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
@@ -16935,338 +15467,308 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQ-FAST-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
 ; AVX512DQ-FAST-NEXT:    # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm30, %zmm0
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm10, %zmm0
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQ-FAST-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
 ; AVX512DQ-FAST-NEXT:    # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm27, %zmm0
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm5, %zmm0
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm21 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm29 {%k1}
 ; AVX512DQ-FAST-NEXT:    vmovdqa (%rcx), %ymm0
 ; AVX512DQ-FAST-NEXT:    vmovdqa (%rdx), %ymm1
 ; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
 ; AVX512DQ-FAST-NEXT:    vmovdqa (%rsi), %ymm4
-; AVX512DQ-FAST-NEXT:    vmovdqa (%rdi), %ymm8
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm9 = ymm8[0],ymm4[0],ymm8[2],ymm4[2]
-; AVX512DQ-FAST-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm9[2,3],ymm3[2,3]
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm21, %zmm3
+; AVX512DQ-FAST-NEXT:    vmovdqa (%rdi), %ymm9
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm11 = ymm9[0],ymm4[0],ymm9[2],ymm4[2]
+; AVX512DQ-FAST-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm11[2,3],ymm3[2,3]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm29, %zmm3
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, %zmm9 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, %zmm5 {%k1}
 ; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
-; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm4[1],ymm8[3],ymm4[3]
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm4[1],ymm9[3],ymm4[3]
 ; AVX512DQ-FAST-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm9, %zmm0
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm5, %zmm0
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm19 {%k1}
-; AVX512DQ-FAST-NEXT:    vmovdqa 64(%rsi), %ymm0
-; AVX512DQ-FAST-NEXT:    vmovdqa 64(%rdi), %ymm1
-; AVX512DQ-FAST-NEXT:    vmovdqa 64(%rcx), %ymm3
-; AVX512DQ-FAST-NEXT:    vmovdqa 64(%rdx), %ymm4
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm8 = ymm4[0],ymm3[0],ymm4[2],ymm3[2]
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm9 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
-; AVX512DQ-FAST-NEXT:    vperm2i128 {{.*#+}} ymm8 = ymm9[2,3],ymm8[2,3]
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm8, %zmm19, %zmm8
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqu64 (%rsp), %zmm9 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm8, %zmm9 {%k1}
-; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm4[1],ymm3[1],ymm4[3],ymm3[3]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm28 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqa 64(%rcx), %ymm0
+; AVX512DQ-FAST-NEXT:    vmovdqa 64(%rdx), %ymm1
+; AVX512DQ-FAST-NEXT:    vmovdqa 64(%rsi), %ymm4
+; AVX512DQ-FAST-NEXT:    vmovdqa 64(%rdi), %ymm9
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm11 = ymm9[0],ymm4[0],ymm9[2],ymm4[2]
+; AVX512DQ-FAST-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm11[2,3],ymm3[2,3]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm28, %zmm3
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, %zmm5 {%k1}
 ; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
-; AVX512DQ-FAST-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3]
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm9, %zmm0
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm4[1],ymm9[3],ymm4[3]
+; AVX512DQ-FAST-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm5, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm14 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm21 {%k1}
 ; AVX512DQ-FAST-NEXT:    vmovdqa 128(%rcx), %ymm0
 ; AVX512DQ-FAST-NEXT:    vmovdqa 128(%rdx), %ymm1
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
-; AVX512DQ-FAST-NEXT:    vmovdqa 128(%rsi), %ymm8
-; AVX512DQ-FAST-NEXT:    vmovdqa 128(%rdi), %ymm9
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm4 = ymm9[0],ymm8[0],ymm9[2],ymm8[2]
-; AVX512DQ-FAST-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3]
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm14, %zmm3
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm4 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX512DQ-FAST-NEXT:    vmovdqa 128(%rsi), %ymm9
+; AVX512DQ-FAST-NEXT:    vmovdqa 128(%rdi), %ymm11
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm11[0],ymm9[0],ymm11[2],ymm9[2]
+; AVX512DQ-FAST-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm4[2,3]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm21, %zmm3
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqu64 (%rsp), %zmm3 # 64-byte Reload
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
 ; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
-; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm8[1],ymm9[3],ymm8[3]
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm11[1],ymm9[1],ymm11[3],ymm9[3]
 ; AVX512DQ-FAST-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
 ; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm0
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm11 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm15 {%k1}
 ; AVX512DQ-FAST-NEXT:    vmovdqa 192(%rcx), %ymm0
 ; AVX512DQ-FAST-NEXT:    vmovdqa 192(%rdx), %ymm1
 ; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
 ; AVX512DQ-FAST-NEXT:    vmovdqa 192(%rsi), %ymm4
-; AVX512DQ-FAST-NEXT:    vmovdqa 192(%rdi), %ymm8
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm14 = ymm8[0],ymm4[0],ymm8[2],ymm4[2]
-; AVX512DQ-FAST-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm14[2,3],ymm3[2,3]
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm11, %zmm24
+; AVX512DQ-FAST-NEXT:    vmovdqa 192(%rdi), %ymm9
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm11 = ymm9[0],ymm4[0],ymm9[2],ymm4[2]
+; AVX512DQ-FAST-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm11[2,3],ymm3[2,3]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm15, %zmm29
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, %zmm9 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, %zmm5 {%k1}
 ; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
-; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm4[1],ymm8[3],ymm4[3]
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm4[1],ymm9[3],ymm4[3]
 ; AVX512DQ-FAST-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm9, %zmm21
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm5, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm10 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm13 {%k1}
 ; AVX512DQ-FAST-NEXT:    vmovdqa 256(%rcx), %ymm0
 ; AVX512DQ-FAST-NEXT:    vmovdqa 256(%rdx), %ymm1
 ; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
 ; AVX512DQ-FAST-NEXT:    vmovdqa 256(%rsi), %ymm4
-; AVX512DQ-FAST-NEXT:    vmovdqa 256(%rdi), %ymm8
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm14 = ymm8[0],ymm4[0],ymm8[2],ymm4[2]
-; AVX512DQ-FAST-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm14[2,3],ymm3[2,3]
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm10, %zmm19
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, %zmm9 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqa 256(%rdi), %ymm11
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm15 = ymm11[0],ymm4[0],ymm11[2],ymm4[2]
+; AVX512DQ-FAST-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm15[2,3],ymm3[2,3]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm13, %zmm3
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm5, %zmm7 {%k1}
 ; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
-; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm4[1],ymm8[3],ymm4[3]
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm11[1],ymm4[1],ymm11[3],ymm4[3]
 ; AVX512DQ-FAST-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm9, %zmm10
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm7 {%k1}
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm7, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm1, %zmm8 {%k1}
 ; AVX512DQ-FAST-NEXT:    vmovdqa 320(%rcx), %ymm1
-; AVX512DQ-FAST-NEXT:    vmovdqa 320(%rdx), %ymm3
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm3[0],ymm1[0],ymm3[2],ymm1[2]
-; AVX512DQ-FAST-NEXT:    vmovdqa 320(%rsi), %ymm4
-; AVX512DQ-FAST-NEXT:    vmovdqa 320(%rdi), %ymm14
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm9 = ymm14[0],ymm4[0],ymm14[2],ymm4[2]
-; AVX512DQ-FAST-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm9[2,3],ymm0[2,3]
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm7, %zmm8
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm26, %zmm13 {%k1}
-; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3]
-; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm14[1],ymm4[1],ymm14[3],ymm4[3]
-; AVX512DQ-FAST-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3]
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm13, %zmm13
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm20, %zmm18 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqa 320(%rdx), %ymm4
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm11 = ymm4[0],ymm1[0],ymm4[2],ymm1[2]
+; AVX512DQ-FAST-NEXT:    vmovdqa 320(%rsi), %ymm12
+; AVX512DQ-FAST-NEXT:    vmovdqa 320(%rdi), %ymm15
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm9 = ymm15[0],ymm12[0],ymm15[2],ymm12[2]
+; AVX512DQ-FAST-NEXT:    vperm2i128 {{.*#+}} ymm9 = ymm9[2,3],ymm11[2,3]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm9, %zmm8, %zmm8
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm23, %zmm22 {%k1}
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm4[1],ymm1[1],ymm4[3],ymm1[3]
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm4 = ymm15[1],ymm12[1],ymm15[3],ymm12[3]
+; AVX512DQ-FAST-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[2,3]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm22, %zmm4
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm26, %zmm18 {%k1}
 ; AVX512DQ-FAST-NEXT:    vmovdqa 384(%rcx), %ymm1
-; AVX512DQ-FAST-NEXT:    vmovdqa 384(%rdx), %ymm3
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm4 = ymm3[0],ymm1[0],ymm3[2],ymm1[2]
-; AVX512DQ-FAST-NEXT:    vmovdqa 384(%rsi), %ymm7
-; AVX512DQ-FAST-NEXT:    vmovdqa 384(%rdi), %ymm9
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm14 = ymm9[0],ymm7[0],ymm9[2],ymm7[2]
-; AVX512DQ-FAST-NEXT:    vperm2i128 {{.*#+}} ymm4 = ymm14[2,3],ymm4[2,3]
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm18, %zmm18
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm29, %zmm25 {%k1}
-; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3]
-; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm9[1],ymm7[1],ymm9[3],ymm7[3]
-; AVX512DQ-FAST-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3]
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm25, %zmm25
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm12, %zmm2 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqa 384(%rdx), %ymm9
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm10 = ymm9[0],ymm1[0],ymm9[2],ymm1[2]
+; AVX512DQ-FAST-NEXT:    vmovdqa 384(%rsi), %ymm11
+; AVX512DQ-FAST-NEXT:    vmovdqa 384(%rdi), %ymm12
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm11[0],ymm12[2],ymm11[2]
+; AVX512DQ-FAST-NEXT:    vperm2i128 {{.*#+}} ymm10 = ymm13[2,3],ymm10[2,3]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm10, %zmm18, %zmm10
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm31, %zmm27 {%k1}
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm1[1],ymm9[3],ymm1[3]
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm9 = ymm12[1],ymm11[1],ymm12[3],ymm11[3]
+; AVX512DQ-FAST-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm9[2,3],ymm1[2,3]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm27, %zmm9
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm16, %zmm2 {%k1}
 ; AVX512DQ-FAST-NEXT:    vmovdqa 448(%rcx), %ymm1
-; AVX512DQ-FAST-NEXT:    vmovdqa 448(%rdx), %ymm3
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm4 = ymm3[0],ymm1[0],ymm3[2],ymm1[2]
-; AVX512DQ-FAST-NEXT:    vmovdqa 448(%rsi), %ymm7
-; AVX512DQ-FAST-NEXT:    vmovdqa 448(%rdi), %ymm9
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm12 = ymm9[0],ymm7[0],ymm9[2],ymm7[2]
-; AVX512DQ-FAST-NEXT:    vperm2i128 {{.*#+}} ymm4 = ymm12[2,3],ymm4[2,3]
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm2, %zmm4
-; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3]
-; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm9[1],ymm7[1],ymm9[3],ymm7[3]
+; AVX512DQ-FAST-NEXT:    vmovdqa 448(%rdx), %ymm7
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm11 = ymm7[0],ymm1[0],ymm7[2],ymm1[2]
+; AVX512DQ-FAST-NEXT:    vmovdqa 448(%rsi), %ymm12
+; AVX512DQ-FAST-NEXT:    vmovdqa 448(%rdi), %ymm13
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm15 = ymm13[0],ymm12[0],ymm13[2],ymm12[2]
+; AVX512DQ-FAST-NEXT:    vperm2i128 {{.*#+}} ymm11 = ymm15[2,3],ymm11[2,3]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm11, %zmm2, %zmm11
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm1[1],ymm7[3],ymm1[3]
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm13[1],ymm12[1],ymm13[3],ymm12[3]
 ; AVX512DQ-FAST-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm28, %zmm17 {%k1}
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm17, %zmm9
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm11 {%k1}
-; AVX512DQ-FAST-NEXT:    vmovdqa (%rcx), %xmm2
-; AVX512DQ-FAST-NEXT:    vmovdqa (%rdx), %xmm3
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm2[0]
-; AVX512DQ-FAST-NEXT:    vmovdqa (%rsi), %xmm7
-; AVX512DQ-FAST-NEXT:    vmovdqa (%rdi), %xmm12
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm14 = xmm12[0],xmm7[0]
-; AVX512DQ-FAST-NEXT:    vinserti128 $1, %xmm1, %ymm14, %ymm1
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm11, %zmm11
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm3[1],xmm2[1]
-; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm3 = xmm12[1],xmm7[1]
-; AVX512DQ-FAST-NEXT:    vinserti128 $1, %xmm2, %ymm3, %ymm2
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm1, %zmm7
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512DQ-FAST-NEXT:    vmovdqa 64(%rcx), %xmm2
-; AVX512DQ-FAST-NEXT:    vmovdqa 64(%rdx), %xmm12
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm12[0],xmm2[0]
-; AVX512DQ-FAST-NEXT:    vmovdqa 64(%rsi), %xmm14
-; AVX512DQ-FAST-NEXT:    vmovdqa64 64(%rdi), %xmm17
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm20 = xmm17[0],xmm14[0]
-; AVX512DQ-FAST-NEXT:    vinserti32x4 $1, %xmm3, %ymm20, %ymm3
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm1, %zmm3
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm12[1],xmm2[1]
-; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm12 = xmm17[1],xmm14[1]
-; AVX512DQ-FAST-NEXT:    vinserti128 $1, %xmm2, %ymm12, %ymm2
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm1, %zmm2
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512DQ-FAST-NEXT:    vmovdqa 128(%rcx), %xmm12
-; AVX512DQ-FAST-NEXT:    vmovdqa 128(%rdx), %xmm14
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm17 = xmm14[0],xmm12[0]
-; AVX512DQ-FAST-NEXT:    vmovdqa64 128(%rsi), %xmm20
-; AVX512DQ-FAST-NEXT:    vmovdqa64 128(%rdi), %xmm26
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm28 = xmm26[0],xmm20[0]
-; AVX512DQ-FAST-NEXT:    vinserti32x4 $1, %xmm17, %ymm28, %ymm17
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm17, %zmm1, %zmm17
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm12 = xmm14[1],xmm12[1]
-; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm14 = xmm26[1],xmm20[1]
-; AVX512DQ-FAST-NEXT:    vinserti128 $1, %xmm12, %ymm14, %ymm12
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm12, %zmm1, %zmm12
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512DQ-FAST-NEXT:    vmovdqa 192(%rcx), %xmm14
-; AVX512DQ-FAST-NEXT:    vmovdqa64 192(%rdx), %xmm20
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm26 = xmm20[0],xmm14[0]
-; AVX512DQ-FAST-NEXT:    vmovdqa64 192(%rsi), %xmm28
-; AVX512DQ-FAST-NEXT:    vmovdqa64 192(%rdi), %xmm29
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm30 = xmm29[0],xmm28[0]
-; AVX512DQ-FAST-NEXT:    vinserti32x4 $1, %xmm26, %ymm30, %ymm26
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm26, %zmm1, %zmm26
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm30, %zmm19 {%k1}
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm19, %zmm7
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm14 = xmm20[1],xmm14[1]
-; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm20 = xmm29[1],xmm28[1]
-; AVX512DQ-FAST-NEXT:    vinserti32x4 $1, %xmm14, %ymm20, %ymm14
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm14, %zmm1, %zmm14
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512DQ-FAST-NEXT:    vmovdqa64 256(%rcx), %xmm20
-; AVX512DQ-FAST-NEXT:    vmovdqa64 256(%rdx), %xmm28
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm29 = xmm28[0],xmm20[0]
-; AVX512DQ-FAST-NEXT:    vmovdqa64 256(%rsi), %xmm30
-; AVX512DQ-FAST-NEXT:    vmovdqa 256(%rdi), %xmm0
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm31 = xmm0[0],xmm30[0]
-; AVX512DQ-FAST-NEXT:    vinserti32x4 $1, %xmm29, %ymm31, %ymm29
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm29, %zmm1, %zmm29
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm1, %zmm27 {%k1}
-; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm20 = xmm28[1],xmm20[1]
-; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm30[1]
-; AVX512DQ-FAST-NEXT:    vinserti32x4 $1, %xmm20, %ymm0, %ymm0
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm27, %zmm1
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, %zmm22 {%k1}
-; AVX512DQ-FAST-NEXT:    vmovdqa64 320(%rcx), %xmm20
-; AVX512DQ-FAST-NEXT:    vmovdqa64 320(%rdx), %xmm28
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm30 = xmm28[0],xmm20[0]
-; AVX512DQ-FAST-NEXT:    vmovdqa64 320(%rsi), %xmm31
-; AVX512DQ-FAST-NEXT:    vmovdqa 320(%rdi), %xmm0
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm27 = xmm0[0],xmm31[0]
-; AVX512DQ-FAST-NEXT:    vinserti32x4 $1, %xmm30, %ymm27, %ymm27
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm27, %zmm22, %zmm22
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm1, %zmm13 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqa (%rsi), %xmm1
+; AVX512DQ-FAST-NEXT:    vinserti128 $1, (%rcx), %ymm1, %ymm2
+; AVX512DQ-FAST-NEXT:    vmovdqa (%rdi), %xmm1
+; AVX512DQ-FAST-NEXT:    vinserti128 $1, (%rdx), %ymm1, %ymm12
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm12[0],ymm2[0],ymm12[2],ymm2[2]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm13, %zmm1
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm13, %zmm15 {%k1}
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm12[1],ymm2[1],ymm12[3],ymm2[3]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm15, %zmm2
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm12, %zmm18 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqa 64(%rsi), %xmm12
+; AVX512DQ-FAST-NEXT:    vinserti128 $1, 64(%rcx), %ymm12, %ymm12
+; AVX512DQ-FAST-NEXT:    vmovdqa 64(%rdi), %xmm13
+; AVX512DQ-FAST-NEXT:    vinserti128 $1, 64(%rdx), %ymm13, %ymm13
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm15 = ymm13[0],ymm12[0],ymm13[2],ymm12[2]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm15, %zmm18, %zmm15
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm18, %zmm19 {%k1}
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm12 = ymm13[1],ymm12[1],ymm13[3],ymm12[3]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm12, %zmm19, %zmm12
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm13, %zmm23 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqa 128(%rsi), %xmm13
+; AVX512DQ-FAST-NEXT:    vinserti128 $1, 128(%rcx), %ymm13, %ymm13
+; AVX512DQ-FAST-NEXT:    vmovdqa64 128(%rdi), %xmm18
+; AVX512DQ-FAST-NEXT:    vinserti32x4 $1, 128(%rdx), %ymm18, %ymm18
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm19 = ymm18[0],ymm13[0],ymm18[2],ymm13[2]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm19, %zmm23, %zmm19
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm27, %zmm23 {%k1}
-; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm20 = xmm28[1],xmm20[1]
-; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm31[1]
-; AVX512DQ-FAST-NEXT:    vinserti32x4 $1, %xmm20, %ymm0, %ymm0
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm23, %zmm0
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm20, %zmm15 {%k1}
-; AVX512DQ-FAST-NEXT:    vmovdqa64 384(%rcx), %xmm20
-; AVX512DQ-FAST-NEXT:    vmovdqa64 384(%rdx), %xmm23
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm27 = xmm23[0],xmm20[0]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm23, %zmm27 {%k1}
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm13 = ymm18[1],ymm13[1],ymm18[3],ymm13[3]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm13, %zmm27, %zmm13
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm18, %zmm5 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqa64 192(%rsi), %xmm18
+; AVX512DQ-FAST-NEXT:    vinserti32x4 $1, 192(%rcx), %ymm18, %ymm18
+; AVX512DQ-FAST-NEXT:    vmovdqa64 192(%rdi), %xmm23
+; AVX512DQ-FAST-NEXT:    vinserti32x4 $1, 192(%rdx), %ymm23, %ymm23
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm27 = ymm23[0],ymm18[0],ymm23[2],ymm18[2]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm27, %zmm5, %zmm27
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm30, %zmm31 {%k1}
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm18 = ymm23[1],ymm18[1],ymm23[3],ymm18[3]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm18, %zmm31, %zmm18
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm5, %zmm16 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqa64 256(%rsi), %xmm23
+; AVX512DQ-FAST-NEXT:    vinserti32x4 $1, 256(%rcx), %ymm23, %ymm23
+; AVX512DQ-FAST-NEXT:    vmovdqa64 256(%rdi), %xmm30
+; AVX512DQ-FAST-NEXT:    vinserti32x4 $1, 256(%rdx), %ymm30, %ymm30
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm31 = ymm30[0],ymm23[0],ymm30[2],ymm23[2]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm31, %zmm16, %zmm26
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm5, %zmm16 {%k1}
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm23 = ymm30[1],ymm23[1],ymm30[3],ymm23[3]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm23, %zmm16, %zmm23
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm5, %zmm24 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqa64 320(%rsi), %xmm28
+; AVX512DQ-FAST-NEXT:    vinserti32x4 $1, 320(%rcx), %ymm28, %ymm28
+; AVX512DQ-FAST-NEXT:    vmovdqa64 320(%rdi), %xmm30
+; AVX512DQ-FAST-NEXT:    vinserti32x4 $1, 320(%rdx), %ymm30, %ymm30
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm31 = ymm30[0],ymm28[0],ymm30[2],ymm28[2]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm31, %zmm24, %zmm21
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm5, %zmm25 {%k1}
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm28 = ymm30[1],ymm28[1],ymm30[3],ymm28[3]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm28, %zmm25, %zmm22
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm5, %zmm17 {%k1}
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 384(%rsi), %xmm28
+; AVX512DQ-FAST-NEXT:    vinserti32x4 $1, 384(%rcx), %ymm28, %ymm28
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 384(%rdi), %xmm30
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm31 = xmm30[0],xmm28[0]
-; AVX512DQ-FAST-NEXT:    vinserti32x4 $1, %xmm27, %ymm31, %ymm27
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm27, %zmm15, %zmm15
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm27, %zmm16 {%k1}
-; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm20 = xmm23[1],xmm20[1]
-; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm23 = xmm30[1],xmm28[1]
-; AVX512DQ-FAST-NEXT:    vinserti32x4 $1, %xmm20, %ymm23, %ymm20
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm20, %zmm16, %zmm16
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm20, %zmm5 {%k1}
-; AVX512DQ-FAST-NEXT:    vmovdqa64 448(%rcx), %xmm20
-; AVX512DQ-FAST-NEXT:    vmovdqa64 448(%rdx), %xmm23
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm27 = xmm23[0],xmm20[0]
+; AVX512DQ-FAST-NEXT:    vinserti32x4 $1, 384(%rdx), %ymm30, %ymm30
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm31 = ymm30[0],ymm28[0],ymm30[2],ymm28[2]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm31, %zmm17, %zmm17
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm5, %zmm20 {%k1}
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm28 = ymm30[1],ymm28[1],ymm30[3],ymm28[3]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm28, %zmm20, %zmm16
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm5, %zmm6 {%k1}
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 448(%rsi), %xmm28
+; AVX512DQ-FAST-NEXT:    vinserti32x4 $1, 448(%rcx), %ymm28, %ymm28
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 448(%rdi), %xmm30
-; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm31 = xmm30[0],xmm28[0]
-; AVX512DQ-FAST-NEXT:    vinserti32x4 $1, %xmm27, %ymm31, %ymm27
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm27, %zmm5, %zmm5
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm27, %zmm6 {%k1}
-; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm20 = xmm23[1],xmm20[1]
-; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm23 = xmm30[1],xmm28[1]
-; AVX512DQ-FAST-NEXT:    vinserti32x4 $1, %xmm20, %ymm23, %ymm20
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm20, %zmm6, %zmm6
+; AVX512DQ-FAST-NEXT:    vinserti32x4 $1, 448(%rdx), %ymm30, %ymm30
+; AVX512DQ-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm31 = ymm30[0],ymm28[0],ymm30[2],ymm28[2]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm31, %zmm6, %zmm6
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm5, %zmm14 {%k1}
+; AVX512DQ-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm28 = ymm30[1],ymm28[1],ymm30[3],ymm28[3]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $0, %ymm28, %zmm14, %zmm5
 ; AVX512DQ-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm9, 3776(%rax)
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm4, 3712(%rax)
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm25, 3264(%rax)
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm18, 3200(%rax)
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm13, 2752(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm7, 3776(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm11, 3712(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm9, 3264(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm10, 3200(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm4, 2752(%rax)
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm8, 2688(%rax)
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm10, 2240(%rax)
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm19, 2176(%rax)
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm21, 1728(%rax)
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm24, 1664(%rax)
-; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovaps %zmm4, 1216(%rax)
-; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovaps %zmm4, 1152(%rax)
-; AVX512DQ-FAST-NEXT:    vmovups (%rsp), %zmm4 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovaps %zmm4, 704(%rax)
-; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovaps %zmm4, 640(%rax)
-; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovaps %zmm4, 192(%rax)
-; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovaps %zmm4, 128(%rax)
-; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovaps %zmm4, 4032(%rax)
-; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovaps %zmm4, 3968(%rax)
-; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovaps %zmm4, 3904(%rax)
-; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovaps %zmm4, 3840(%rax)
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm6, 3648(%rax)
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm5, 3584(%rax)
-; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovaps %zmm4, 3520(%rax)
-; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovaps %zmm4, 3456(%rax)
-; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovaps %zmm4, 3392(%rax)
-; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovaps %zmm4, 3328(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, 2240(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, 2176(%rax)
+; AVX512DQ-FAST-NEXT:    vmovups (%rsp), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 1728(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm29, 1664(%rax)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 1216(%rax)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 1152(%rax)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 704(%rax)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 640(%rax)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 192(%rax)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 128(%rax)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 4032(%rax)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 3968(%rax)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 3904(%rax)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 3840(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm5, 3648(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm6, 3584(%rax)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 3520(%rax)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 3456(%rax)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 3392(%rax)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 3328(%rax)
 ; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm16, 3136(%rax)
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm15, 3072(%rax)
-; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovaps %zmm4, 3008(%rax)
-; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovaps %zmm4, 2944(%rax)
-; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovaps %zmm4, 2880(%rax)
-; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovaps %zmm4, 2816(%rax)
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, 2624(%rax)
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm22, 2560(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm17, 3072(%rax)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 3008(%rax)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 2944(%rax)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 2880(%rax)
+; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 2816(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm22, 2624(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm21, 2560(%rax)
 ; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 2496(%rax)
 ; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
@@ -17275,8 +15777,8 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 2368(%rax)
 ; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 2304(%rax)
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm1, 2112(%rax)
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm29, 2048(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm23, 2112(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm26, 2048(%rax)
 ; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 1984(%rax)
 ; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
@@ -17285,8 +15787,8 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 1856(%rax)
 ; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 1792(%rax)
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm14, 1600(%rax)
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm26, 1536(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm18, 1600(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm27, 1536(%rax)
 ; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 1472(%rax)
 ; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
@@ -17295,8 +15797,8 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 1344(%rax)
 ; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 1280(%rax)
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm12, 1088(%rax)
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm17, 1024(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm13, 1088(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm19, 1024(%rax)
 ; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 960(%rax)
 ; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
@@ -17305,8 +15807,8 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 832(%rax)
 ; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 768(%rax)
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, 576(%rax)
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, 512(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm12, 576(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm15, 512(%rax)
 ; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 448(%rax)
 ; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
@@ -17315,608 +15817,621 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 320(%rax)
 ; AVX512DQ-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQ-FAST-NEXT:    vmovaps %zmm0, 256(%rax)
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm7, 64(%rax)
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm11, (%rax)
-; AVX512DQ-FAST-NEXT:    addq $5384, %rsp # imm = 0x1508
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, 64(%rax)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm1, (%rax)
+; AVX512DQ-FAST-NEXT:    addq $5576, %rsp # imm = 0x15C8
 ; AVX512DQ-FAST-NEXT:    vzeroupper
 ; AVX512DQ-FAST-NEXT:    retq
 ;
 ; AVX512BW-ONLY-SLOW-LABEL: store_i64_stride8_vf64:
 ; AVX512BW-ONLY-SLOW:       # %bb.0:
-; AVX512BW-ONLY-SLOW-NEXT:    subq $5384, %rsp # imm = 0x1508
+; AVX512BW-ONLY-SLOW-NEXT:    subq $5576, %rsp # imm = 0x15C8
 ; AVX512BW-ONLY-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512BW-ONLY-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 128(%rdi), %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps 128(%rdi), %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 64(%rdi), %zmm4
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm8
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 128(%rsi), %zmm16
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 64(%rsi), %zmm17
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 128(%rsi), %zmm23
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 64(%rsi), %zmm25
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%rsi), %zmm3
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 64(%rdx), %zmm6
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%rdx), %zmm9
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 64(%rcx), %zmm7
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%rcx), %zmm11
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%r8), %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 64(%r8), %zmm24
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 128(%r8), %zmm20
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 64(%r9), %zmm25
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 128(%r9), %zmm22
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%r9), %zmm18
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 64(%r10), %zmm23
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%r10), %zmm21
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 64(%rax), %zmm26
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%rax), %zmm19
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%rcx), %zmm10
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%r8), %zmm14
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 64(%r8), %zmm19
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 128(%r8), %zmm16
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%r9), %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 64(%r9), %zmm21
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 128(%r9), %zmm18
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%r10), %zmm20
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 64(%r10), %zmm28
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 (%rax), %zmm26
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 64(%rax), %zmm24
 ; AVX512BW-ONLY-SLOW-NEXT:    movb $-64, %r11b
 ; AVX512BW-ONLY-SLOW-NEXT:    kmovd %r11d, %k1
-; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm14 = [4,12,4,12,4,12,4,12]
-; AVX512BW-ONLY-SLOW-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm19, %zmm14, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm10 = zmm2[0],zmm18[0],zmm2[2],zmm18[2],zmm2[4],zmm18[4],zmm2[6],zmm18[6]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm10 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [4,12,4,12,4,12,4,12]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm26, %zmm2, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm11 = zmm14[0],zmm1[0],zmm14[2],zmm1[2],zmm14[4],zmm1[4],zmm14[6],zmm1[6]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm11 {%k1}
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm14, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm15 = [4,12,4,12]
-; AVX512BW-ONLY-SLOW-NEXT:    # ymm15 = mem[0,1,0,1]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm13
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm11, %zmm15, %zmm13
-; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm10, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm2, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm13 = [4,12,4,12]
+; AVX512BW-ONLY-SLOW-NEXT:    # ymm13 = mem[0,1,0,1]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm12
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm10, %zmm13, %zmm12
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm11, %zmm0
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,5,13,5,13,5,13]
-; AVX512BW-ONLY-SLOW-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm19, %zmm12, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm10 = zmm2[1],zmm18[1],zmm2[3],zmm18[3],zmm2[5],zmm18[5],zmm2[7],zmm18[7]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm5
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm10 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm5 = [5,13,5,13,5,13,5,13]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm26, %zmm5, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm11 = zmm14[1],zmm1[1],zmm14[3],zmm1[3],zmm14[5],zmm1[5],zmm14[7],zmm1[7]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm14, %zmm15
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm11 {%k1}
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm12, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [5,13,5,13]
-; AVX512BW-ONLY-SLOW-NEXT:    # ymm2 = mem[0,1,0,1]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm13
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm11, %zmm2, %zmm13
-; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm10, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm5, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm14 = [5,13,5,13]
+; AVX512BW-ONLY-SLOW-NEXT:    # ymm14 = mem[0,1,0,1]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm12
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm10, %zmm14, %zmm12
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm11, %zmm0
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,6,14,6,14,6,14]
-; AVX512BW-ONLY-SLOW-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm27
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm18, %zmm13, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm21[0],zmm19[0],zmm21[2],zmm19[2],zmm21[4],zmm19[4],zmm21[6],zmm19[6]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm10
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm13, %zmm10
-; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [6,14,6,14]
-; AVX512BW-ONLY-SLOW-NEXT:    # ymm5 = mem[0,1,0,1]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm13
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm11, %zmm5, %zmm13
-; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm13[4,5,6,7]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm10, %zmm0, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm12 = [6,14,6,14,6,14,6,14]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm15, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm15, %zmm22
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm17
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm12, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm20[0],zmm26[0],zmm20[2],zmm26[2],zmm20[4],zmm26[4],zmm20[6],zmm26[6]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm11
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm12, %zmm11
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm15 = [6,14,6,14]
+; AVX512BW-ONLY-SLOW-NEXT:    # ymm15 = mem[0,1,0,1]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm12
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm10, %zmm15, %zmm12
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm11, %zmm0, %zmm0
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,7,15,7,15,7,15]
 ; AVX512BW-ONLY-SLOW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm27, %zmm10
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm18, %zmm0, %zmm10
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm10 {%k1} = zmm21[1],zmm19[1],zmm21[3],zmm19[3],zmm21[5],zmm19[5],zmm21[7],zmm19[7]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm22, %zmm11
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm17, %zmm0, %zmm11
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm11 {%k1} = zmm20[1],zmm26[1],zmm20[3],zmm26[3],zmm20[5],zmm26[5],zmm20[7],zmm26[7]
 ; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm0, %zmm8
 ; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} ymm27 = [7,15,7,15]
 ; AVX512BW-ONLY-SLOW-NEXT:    # ymm27 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm11, %zmm27, %zmm9
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm10, %zmm27, %zmm9
 ; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm8, %zmm10, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm8, %zmm11, %zmm8
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm8
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm26, %zmm14, %zmm8
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm9 = zmm24[0],zmm25[0],zmm24[2],zmm25[2],zmm24[4],zmm25[4],zmm24[6],zmm25[6]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm12
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm24, %zmm2, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm9 = zmm19[0],zmm21[0],zmm19[2],zmm21[2],zmm19[4],zmm21[4],zmm19[6],zmm21[6]
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm9 {%k1}
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm8
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm17, %zmm14, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm25, %zmm2, %zmm8
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm10
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm15, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm13, %zmm10
 ; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7]
 ; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm8, %zmm9, %zmm8
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm8
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm26, %zmm12, %zmm8
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm9 = zmm24[1],zmm25[1],zmm24[3],zmm25[3],zmm24[5],zmm25[5],zmm24[7],zmm25[7]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm24, %zmm5, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm9 = zmm19[1],zmm21[1],zmm19[3],zmm21[3],zmm19[5],zmm21[5],zmm19[7],zmm21[7]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm9 {%k1}
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm8
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm17, %zmm12, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm25, %zmm5, %zmm8
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm10
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm2, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm14, %zmm10
 ; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7]
 ; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm8, %zmm9, %zmm8
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm8
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm17, %zmm11, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm25, %zmm1, %zmm8
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm9
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm10
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm5, %zmm9
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm15, %zmm9
 ; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm24, %zmm5
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm24, (%rsp) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm24, %zmm9
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm25, %zmm11, %zmm9
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm23[0],zmm26[0],zmm23[2],zmm26[2],zmm23[4],zmm26[4],zmm23[6],zmm26[6]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm19, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm19, %zmm9
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm21, %zmm1, %zmm9
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm24, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm28[0],zmm24[0],zmm28[2],zmm24[2],zmm28[4],zmm24[4],zmm28[6],zmm24[6]
 ; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm8, %zmm9, %zmm8
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 128(%r10), %zmm9
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm17, %zmm0, %zmm4
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 128(%rax), %zmm18
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 128(%r10), %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm25, %zmm0, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 128(%rax), %zmm24
 ; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm27, %zmm6
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm25, %zmm0, %zmm5
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm23[1],zmm26[1],zmm23[3],zmm26[3],zmm23[5],zmm26[5],zmm23[7],zmm26[7]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm21, %zmm0, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm28[1],zmm10[1],zmm28[3],zmm10[3],zmm28[5],zmm10[5],zmm28[7],zmm10[7]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm21
 ; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7]
 ; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm5, %zmm4
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm4
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm18, %zmm14, %zmm4
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm5 = zmm20[0],zmm22[0],zmm20[2],zmm22[2],zmm20[4],zmm22[4],zmm20[6],zmm22[6]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm24, %zmm12, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm5 = zmm16[0],zmm18[0],zmm16[2],zmm18[2],zmm16[4],zmm18[4],zmm16[6],zmm18[6]
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm5 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm6
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm16, %zmm14, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm23, %zmm12, %zmm6
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 128(%rdx), %zmm4
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 128(%rcx), %zmm7
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm8
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm15, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm13, %zmm8
 ; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
 ; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm6, %zmm5, %zmm5
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm5
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm18, %zmm12, %zmm5
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm6 = zmm20[1],zmm22[1],zmm20[3],zmm22[3],zmm20[5],zmm22[5],zmm20[7],zmm22[7]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm24, %zmm3, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm6 = zmm16[1],zmm18[1],zmm16[3],zmm18[3],zmm16[5],zmm18[5],zmm16[7],zmm18[7]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm6 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm5
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm16, %zmm12, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm23, %zmm3, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm10
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm8
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm2, %zmm8
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm14, %zmm8
 ; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm5, %zmm6, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm5
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm16, %zmm11, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm5, %zmm6, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm11
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm23, %zmm1, %zmm5
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm6
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm10, %zmm6
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm15, %zmm6
 ; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm6
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm22, %zmm11, %zmm6
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm18[0],zmm9[2],zmm18[2],zmm9[4],zmm18[4],zmm9[6],zmm18[6]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm5, %zmm6, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm16, %zmm0, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm18, %zmm1, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, (%rsp) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm2[0],zmm24[0],zmm2[2],zmm24[2],zmm2[4],zmm24[4],zmm2[6],zmm24[6]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm5, %zmm6, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm23, %zmm0, %zmm9
 ; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm27, %zmm4
-; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm22, %zmm0, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm18[1],zmm9[3],zmm18[3],zmm9[5],zmm18[5],zmm9[7],zmm18[7]
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm4[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm18, %zmm0, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm3[1],zmm24[1],zmm3[3],zmm24[3],zmm3[5],zmm24[5],zmm3[7],zmm24[7]
 ; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm2, %zmm1
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 192(%r10), %zmm9
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 192(%rax), %zmm20
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm1
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm20, %zmm14, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 192(%r10), %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 192(%rax), %zmm17
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm17, %zmm12, %zmm1
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 192(%r8), %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 192(%r9), %zmm10
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm4 = zmm2[0],zmm10[0],zmm2[2],zmm10[2],zmm2[4],zmm10[4],zmm2[6],zmm10[6]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm16
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 192(%r9), %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm4 = zmm2[0],zmm5[0],zmm2[2],zmm5[2],zmm2[4],zmm5[4],zmm2[6],zmm5[6]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm16
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm9
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 192(%rdi), %zmm1
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 192(%rsi), %zmm5
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm6
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm14, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm12, %zmm6
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 192(%rdx), %zmm2
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 192(%rcx), %zmm7
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm8
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm15, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm13, %zmm8
 ; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
 ; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm6, %zmm4, %zmm4
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm4
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm20, %zmm12, %zmm4
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm6 = zmm16[1],zmm10[1],zmm16[3],zmm10[3],zmm16[5],zmm10[5],zmm16[7],zmm10[7]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm17, %zmm10, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm6 = zmm9[1],zmm16[1],zmm9[3],zmm16[3],zmm9[5],zmm16[5],zmm9[7],zmm16[7]
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm6 {%k1}
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm4
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm12, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm10, %zmm4
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm8
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm13, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm14, %zmm8
 ; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
 ; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm6, %zmm4
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm4
 ; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm11, %zmm4
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm6
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm3, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm15, %zmm6
 ; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm6
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm10, %zmm11, %zmm6
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm20[0],zmm9[2],zmm20[2],zmm9[4],zmm20[4],zmm9[6],zmm20[6]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm16, %zmm11, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm3[0],zmm17[0],zmm3[2],zmm17[2],zmm3[4],zmm17[4],zmm3[6],zmm17[6]
 ; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm6, %zmm4
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm0, %zmm1
 ; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm27, %zmm2
 ; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm10, %zmm0, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm20[1],zmm9[3],zmm20[3],zmm9[5],zmm20[5],zmm9[7],zmm20[7]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm16, %zmm0, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm3[1],zmm17[1],zmm3[3],zmm17[3],zmm3[5],zmm17[5],zmm3[7],zmm17[7]
 ; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm2, %zmm1
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 256(%r10), %zmm9
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 256(%rax), %zmm22
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm1
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm22, %zmm14, %zmm1
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 256(%r8), %zmm5
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 256(%r9), %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm4 = zmm5[0],zmm2[0],zmm5[2],zmm2[2],zmm5[4],zmm2[4],zmm5[6],zmm2[6]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm17
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm16
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 256(%r10), %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 256(%rax), %zmm25
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm25, %zmm12, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 256(%r8), %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 256(%r9), %zmm18
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm4 = zmm2[0],zmm18[0],zmm2[2],zmm18[2],zmm2[4],zmm18[4],zmm2[6],zmm18[6]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm9
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 256(%rdi), %zmm1
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 256(%rsi), %zmm5
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm6
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm14, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm12, %zmm6
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 256(%rdx), %zmm2
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 256(%rcx), %zmm7
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm8
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm15, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm13, %zmm8
 ; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
 ; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm6, %zmm4, %zmm4
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm4
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm22, %zmm12, %zmm4
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm6 = zmm17[1],zmm16[1],zmm17[3],zmm16[3],zmm17[5],zmm16[5],zmm17[7],zmm16[7]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm25, %zmm10, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm6 = zmm9[1],zmm18[1],zmm9[3],zmm18[3],zmm9[5],zmm18[5],zmm9[7],zmm18[7]
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm6 {%k1}
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm4
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm12, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm10, %zmm4
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm8
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm13, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm14, %zmm8
 ; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
 ; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm6, %zmm4
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm4
 ; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm11, %zmm4
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm6
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm3, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm15, %zmm6
 ; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm6
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm16, %zmm11, %zmm6
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm22[0],zmm9[2],zmm22[2],zmm9[4],zmm22[4],zmm9[6],zmm22[6]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm18, %zmm11, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm3[0],zmm25[0],zmm3[2],zmm25[2],zmm3[4],zmm25[4],zmm3[6],zmm25[6]
 ; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm6, %zmm4
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm0, %zmm1
 ; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm27, %zmm2
 ; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm16, %zmm0, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm22[1],zmm9[3],zmm22[3],zmm9[5],zmm22[5],zmm9[7],zmm22[7]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm18, %zmm0, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm3[1],zmm25[1],zmm3[3],zmm25[3],zmm3[5],zmm25[5],zmm3[7],zmm25[7]
 ; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm2, %zmm1
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 320(%rdi), %zmm1
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 320(%rsi), %zmm17
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 320(%rsi), %zmm2
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm4
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm17, %zmm14, %zmm4
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 320(%rdx), %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 320(%rcx), %zmm9
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm5
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm9, %zmm15, %zmm5
-; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %ymm4, %ymm24
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm6
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm9, %zmm13, %zmm6
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm7
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm9, %zmm3, %zmm7
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm5
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm9, %zmm27, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 384(%rdx), %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm2, %zmm12, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 320(%rdx), %zmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 320(%rcx), %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm13, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm6 = ymm4[0,1,2,3],ymm6[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm11
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm14, %zmm11
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm15, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm5, %zmm27, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 384(%rdx), %zmm3
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 384(%rcx), %zmm4
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm9
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm15, %zmm9
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm9
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm13, %zmm9
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm3
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm5, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm13, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm14, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm15, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm27, %zmm3
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm27, %zmm8
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 448(%rdx), %zmm3
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 448(%rcx), %zmm4
-; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm4, %zmm3, %zmm15
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm4, %zmm3, %zmm13
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm4, %zmm3, %zmm5
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm4, %zmm3, %zmm14
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm4, %zmm3, %zmm15
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm27, %zmm3
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 320(%r10), %zmm26
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 320(%rax), %zmm16
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm26, %zmm3
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm16, %zmm14, %zmm3
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 320(%r8), %zmm13
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 320(%r9), %zmm15
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm9 = zmm13[0],zmm15[0],zmm13[2],zmm15[2],zmm13[4],zmm15[4],zmm13[6],zmm15[6]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm9 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm24, %zmm9, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 320(%r10), %zmm23
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 320(%rax), %zmm9
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm9, %zmm12, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 320(%r8), %zmm22
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 320(%r9), %zmm16
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm15 = zmm22[0],zmm16[0],zmm22[2],zmm16[2],zmm22[4],zmm16[4],zmm22[6],zmm16[6]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm15 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm6, %zmm15, %zmm3
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm26, %zmm3
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm16, %zmm12, %zmm3
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm5 = zmm13[1],zmm15[1],zmm13[3],zmm15[3],zmm13[5],zmm15[5],zmm13[7],zmm15[7]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm5 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm9, %zmm4, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm6 = zmm22[1],zmm16[1],zmm22[3],zmm16[3],zmm22[5],zmm16[5],zmm22[7],zmm16[7]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm6 {%k1}
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm3
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm17, %zmm12, %zmm3
-; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm5, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm2, %zmm4, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm6, %zmm3
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm3
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm17, %zmm11, %zmm3
-; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5,6,7]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm5
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm15, %zmm11, %zmm5
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm26[0],zmm16[0],zmm26[2],zmm16[2],zmm26[4],zmm16[4],zmm26[6],zmm16[6]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm5, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm2, %zmm10, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm22, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm16, %zmm10, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm23[0],zmm9[0],zmm23[2],zmm9[2],zmm23[4],zmm9[4],zmm23[6],zmm9[6]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm6, %zmm3
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm17, %zmm0, %zmm1
-; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm15, %zmm0, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm26[1],zmm16[1],zmm26[3],zmm16[3],zmm26[5],zmm16[5],zmm26[7],zmm16[7]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm2, %zmm0, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm22, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm16, %zmm0, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm23[1],zmm9[1],zmm23[3],zmm9[3],zmm23[5],zmm9[5],zmm23[7],zmm9[7]
 ; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm2, %zmm1
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 384(%rdi), %zmm3
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 384(%rsi), %zmm1
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm4
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm14, %zmm4
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm5
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm4
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm12, %zmm5
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm5
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm11, %zmm5
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm12, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm4, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm10, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm0, %zmm3
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 384(%r10), %zmm29
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 384(%r10), %zmm31
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 384(%rax), %zmm3
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm29, %zmm1
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm14, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm12, %zmm1
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm29, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm1
 ; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm4, %zmm1
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 384(%r8), %zmm25
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 384(%r9), %zmm12
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm25, %zmm31
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm12, %zmm11, %zmm31
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 448(%r10), %zmm28
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 448(%rax), %zmm9
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm1
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm9, %zmm14, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 384(%r8), %zmm27
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 384(%r9), %zmm10
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm27, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm10, %zmm5, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 448(%r10), %zmm30
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 448(%rax), %zmm11
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm30, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm11, %zmm12, %zmm1
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 448(%rdi), %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 448(%rdi), %zmm6
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 448(%rsi), %zmm1
-; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm1, %zmm5, %zmm14
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm9, %zmm4, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm1, %zmm6, %zmm12
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm30, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm11, %zmm4, %zmm2
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm1, %zmm5, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm1, %zmm6, %zmm4
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 448(%r8), %zmm17
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 448(%r9), %zmm8
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm30
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm8, %zmm11, %zmm30
-; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm1, %zmm5, %zmm11
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm0, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 448(%r8), %zmm19
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 448(%r9), %zmm12
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm19, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm12, %zmm5, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm1, %zmm6, %zmm5
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm25, %zmm24
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm12, %zmm0, %zmm24
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm27
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm8, %zmm0, %zmm27
-; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm5 = [0,8,0,8,0,8,0,8]
-; AVX512BW-ONLY-SLOW-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm19, %zmm5, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm6 = [1,9,1,9,1,9,1,9]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm1, %zmm0, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm27, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm10, %zmm0, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm5
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm19, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm12, %zmm0, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm6 = [0,8,0,8,0,8,0,8]
 ; AVX512BW-ONLY-SLOW-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm19, %zmm6, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm26, %zmm6, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm14 = [1,9,1,9,1,9,1,9]
+; AVX512BW-ONLY-SLOW-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm26, %zmm14, %zmm0
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10]
 ; AVX512BW-ONLY-SLOW-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm19, %zmm2, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm26, %zmm2, %zmm0
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [3,11,3,11,3,11,3,11]
 ; AVX512BW-ONLY-SLOW-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm19, %zmm1, %zmm21
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm26, %zmm1, %zmm20
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm8
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm5, %zmm7
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm7
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm6, %zmm7
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm21
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm2, %zmm21
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm6, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm14, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm29
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm2, %zmm29
 ; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm1, %zmm0
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm21, %zmm6, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm21, %zmm14, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm21, %zmm2, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm21, %zmm1, %zmm28
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm5, %zmm4
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm4
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm6, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm26, %zmm6, %zmm4
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm4
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm2, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm26, %zmm14, %zmm4
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm1, %zmm23
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm28
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm26, %zmm2, %zmm28
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm26, %zmm1, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm7
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm5, %zmm7
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm7
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm6, %zmm7
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm19
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm2, %zmm19
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm1, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm4
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm18, %zmm5, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm24, %zmm6, %zmm4
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm4
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm18, %zmm6, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm24, %zmm14, %zmm4
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm4
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm18, %zmm2, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm24, %zmm2, %zmm4
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm18, %zmm1, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm24, %zmm1, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm4
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm7
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm5, %zmm7
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm7
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm6, %zmm7
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm6, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm14, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm21
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm2, %zmm21
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm0, %zmm1, %zmm7
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm14
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm2, %zmm14
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm1, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm4
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm20, %zmm5, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm17, %zmm6, %zmm4
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm4
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm20, %zmm6, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm17, %zmm14, %zmm4
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm4
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm20, %zmm2, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm17, %zmm2, %zmm4
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm20, %zmm1, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm17, %zmm1, %zmm0
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm4
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm10, %zmm5, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm6, %zmm4
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm4
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm10, %zmm6, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm14, %zmm4
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm11
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm10, %zmm2, %zmm11
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm10, %zmm1, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm15
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm2, %zmm15
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm1, %zmm0
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm4
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm22, %zmm5, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm25, %zmm6, %zmm4
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm4
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm22, %zmm6, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm25, %zmm14, %zmm4
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm4
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm22, %zmm2, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm25, %zmm2, %zmm4
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm22, %zmm1, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm25, %zmm1, %zmm0
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm7
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm5, %zmm7
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm7
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm6, %zmm7
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm10
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm2, %zmm10
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm4, %zmm1, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm26, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm16, %zmm5, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm18, %zmm6, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm18, %zmm14, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm18, %zmm2, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm18, %zmm1, %zmm0
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm26, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm16, %zmm6, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm9, %zmm6, %zmm0
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm26, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm16, %zmm2, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm9, %zmm14, %zmm0
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm16, %zmm1, %zmm26
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm22
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm15, %zmm5, %zmm22
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm23
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm15, %zmm6, %zmm23
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm7
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm15, %zmm2, %zmm7
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm15, %zmm1, %zmm13
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm29, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm5, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm9, %zmm2, %zmm0
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm29, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm9, %zmm1, %zmm23
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm22, %zmm24
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm16, %zmm6, %zmm24
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm22, %zmm25
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm16, %zmm14, %zmm25
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm22, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm16, %zmm2, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm16, %zmm1, %zmm22
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm0
 ; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm6, %zmm0
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm31 {%k1} = zmm29[0],zmm3[0],zmm29[2],zmm3[2],zmm29[4],zmm3[4],zmm29[6],zmm3[6]
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm24 {%k1} = zmm29[1],zmm3[1],zmm29[3],zmm3[3],zmm29[5],zmm3[5],zmm29[7],zmm3[7]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm29, %zmm20
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm2, %zmm20
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm1, %zmm29
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm25, %zmm15
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm12, %zmm5, %zmm15
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm25, %zmm16
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm12, %zmm6, %zmm16
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm25, %zmm18
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm12, %zmm2, %zmm18
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm25[0],zmm12[0],zmm25[2],zmm12[2],zmm25[4],zmm12[4],zmm25[6],zmm12[6]
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm4 = zmm25[1],zmm12[1],zmm25[3],zmm12[3],zmm25[5],zmm12[5],zmm25[7],zmm12[7]
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm12, %zmm1, %zmm25
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm9, %zmm5, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm14, %zmm0
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm9, %zmm6, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm31[0],zmm3[0],zmm31[2],zmm3[2],zmm31[4],zmm3[4],zmm31[6],zmm3[6]
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm31[1],zmm3[1],zmm31[3],zmm3[3],zmm31[5],zmm3[5],zmm31[7],zmm3[7]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm26
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm2, %zmm26
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm3, %zmm1, %zmm31
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm27, %zmm17
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm10, %zmm6, %zmm17
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm27, %zmm20
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm10, %zmm14, %zmm20
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm27, %zmm18
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm10, %zmm2, %zmm18
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm27[0],zmm10[0],zmm27[2],zmm10[2],zmm27[4],zmm10[4],zmm27[6],zmm10[6]
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm4 = zmm27[1],zmm10[1],zmm27[3],zmm10[3],zmm27[5],zmm10[5],zmm27[7],zmm10[7]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm10, %zmm1, %zmm27
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm30, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm11, %zmm6, %zmm0
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm30 {%k1} = zmm28[0],zmm9[0],zmm28[2],zmm9[2],zmm28[4],zmm9[4],zmm28[6],zmm9[6]
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm28[1],zmm9[1],zmm28[3],zmm9[3],zmm28[5],zmm9[5],zmm28[7],zmm9[7]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm12
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm9, %zmm2, %zmm12
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm9, %zmm1, %zmm28
-; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm8, %zmm17, %zmm5
-; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm8, %zmm17, %zmm6
-; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm8, %zmm17, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm9 = zmm17[0],zmm8[0],zmm17[2],zmm8[2],zmm17[4],zmm8[4],zmm17[6],zmm8[6]
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm0 = zmm17[1],zmm8[1],zmm17[3],zmm8[3],zmm17[5],zmm8[5],zmm17[7],zmm8[7]
-; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm8, %zmm1, %zmm17
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm30, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm11, %zmm14, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm10 {%k1} = zmm30[0],zmm11[0],zmm30[2],zmm11[2],zmm30[4],zmm11[4],zmm30[6],zmm11[6]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm30[1],zmm11[1],zmm30[3],zmm11[3],zmm30[5],zmm11[5],zmm30[7],zmm11[7]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm30, %zmm16
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm11, %zmm2, %zmm16
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm11, %zmm1, %zmm30
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm12, %zmm19, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm12, %zmm19, %zmm14
+; AVX512BW-ONLY-SLOW-NEXT:    vpermi2q %zmm12, %zmm19, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm11 = zmm19[0],zmm12[0],zmm19[2],zmm12[2],zmm19[4],zmm12[4],zmm19[6],zmm12[6]
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm0 = zmm19[1],zmm12[1],zmm19[3],zmm12[3],zmm19[5],zmm12[5],zmm19[7],zmm12[7]
+; AVX512BW-ONLY-SLOW-NEXT:    vpermt2q %zmm12, %zmm1, %zmm19
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
 ; AVX512BW-ONLY-SLOW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
 ; AVX512BW-ONLY-SLOW-NEXT:    # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm3 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm3 {%k1}
 ; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm3, %zmm1
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
@@ -17929,19 +16444,20 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
 ; AVX512BW-ONLY-SLOW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
 ; AVX512BW-ONLY-SLOW-NEXT:    # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm31, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm7, %zmm1
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
 ; AVX512BW-ONLY-SLOW-NEXT:    # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm24, %zmm1
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vinsertf64x4 $0, %ymm1, %zmm3, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
 ; AVX512BW-ONLY-SLOW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
 ; AVX512BW-ONLY-SLOW-NEXT:    # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm9 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm9, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm11 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm11, %zmm1
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
 ; AVX512BW-ONLY-SLOW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
@@ -17953,338 +16469,308 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-ONLY-SLOW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
 ; AVX512BW-ONLY-SLOW-NEXT:    # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm30, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm10, %zmm0
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-ONLY-SLOW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
 ; AVX512BW-ONLY-SLOW-NEXT:    # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm27, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm5, %zmm0
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm21 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm29 {%k1}
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa (%rcx), %ymm0
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa (%rdx), %ymm1
 ; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa (%rsi), %ymm4
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa (%rdi), %ymm8
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm9 = ymm8[0],ymm4[0],ymm8[2],ymm4[2]
-; AVX512BW-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm9[2,3],ymm3[2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm21, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa (%rdi), %ymm9
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm11 = ymm9[0],ymm4[0],ymm9[2],ymm4[2]
+; AVX512BW-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm11[2,3],ymm3[2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm29, %zmm3
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm9 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm5 {%k1}
 ; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm4[1],ymm8[3],ymm4[3]
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm4[1],ymm9[3],ymm4[3]
 ; AVX512BW-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm9, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm5, %zmm0
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm19 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 64(%rsi), %ymm0
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 64(%rdi), %ymm1
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 64(%rcx), %ymm3
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 64(%rdx), %ymm4
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm8 = ymm4[0],ymm3[0],ymm4[2],ymm3[2]
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm9 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
-; AVX512BW-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm8 = ymm9[2,3],ymm8[2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm8, %zmm19, %zmm8
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 (%rsp), %zmm9 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm9 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm4[1],ymm3[1],ymm4[3],ymm3[3]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm28 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 64(%rcx), %ymm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 64(%rdx), %ymm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 64(%rsi), %ymm4
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 64(%rdi), %ymm9
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm11 = ymm9[0],ymm4[0],ymm9[2],ymm4[2]
+; AVX512BW-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm11[2,3],ymm3[2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm28, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm5 {%k1}
 ; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
-; AVX512BW-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm9, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm4[1],ymm9[3],ymm4[3]
+; AVX512BW-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm5, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm14 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm21 {%k1}
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 128(%rcx), %ymm0
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 128(%rdx), %ymm1
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 128(%rsi), %ymm8
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 128(%rdi), %ymm9
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm4 = ymm9[0],ymm8[0],ymm9[2],ymm8[2]
-; AVX512BW-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm14, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm4 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 128(%rsi), %ymm9
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 128(%rdi), %ymm11
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm11[0],ymm9[0],ymm11[2],ymm9[2]
+; AVX512BW-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm4[2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm21, %zmm3
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 (%rsp), %zmm3 # 64-byte Reload
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
 ; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm8[1],ymm9[3],ymm8[3]
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm11[1],ymm9[1],ymm11[3],ymm9[3]
 ; AVX512BW-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
 ; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm0
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm11 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm15 {%k1}
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 192(%rcx), %ymm0
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 192(%rdx), %ymm1
 ; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 192(%rsi), %ymm4
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 192(%rdi), %ymm8
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm14 = ymm8[0],ymm4[0],ymm8[2],ymm4[2]
-; AVX512BW-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm14[2,3],ymm3[2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm11, %zmm24
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 192(%rdi), %ymm9
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm11 = ymm9[0],ymm4[0],ymm9[2],ymm4[2]
+; AVX512BW-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm11[2,3],ymm3[2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm15, %zmm29
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm9 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm5 {%k1}
 ; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm4[1],ymm8[3],ymm4[3]
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm4[1],ymm9[3],ymm4[3]
 ; AVX512BW-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm9, %zmm21
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm5, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm10 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm13 {%k1}
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 256(%rcx), %ymm0
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 256(%rdx), %ymm1
 ; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 256(%rsi), %ymm4
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 256(%rdi), %ymm8
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm14 = ymm8[0],ymm4[0],ymm8[2],ymm4[2]
-; AVX512BW-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm14[2,3],ymm3[2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm10, %zmm19
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm9 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 256(%rdi), %ymm11
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm15 = ymm11[0],ymm4[0],ymm11[2],ymm4[2]
+; AVX512BW-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm15[2,3],ymm3[2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm13, %zmm3
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm7 {%k1}
 ; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm4[1],ymm8[3],ymm4[3]
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm11[1],ymm4[1],ymm11[3],ymm4[3]
 ; AVX512BW-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm9, %zmm10
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm7 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm7, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm8 {%k1}
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 320(%rcx), %ymm1
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 320(%rdx), %ymm3
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm3[0],ymm1[0],ymm3[2],ymm1[2]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 320(%rsi), %ymm4
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 320(%rdi), %ymm14
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm9 = ymm14[0],ymm4[0],ymm14[2],ymm4[2]
-; AVX512BW-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm9[2,3],ymm0[2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm7, %zmm8
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm26, %zmm13 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3]
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm14[1],ymm4[1],ymm14[3],ymm4[3]
-; AVX512BW-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm13, %zmm13
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm18 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 320(%rdx), %ymm4
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm11 = ymm4[0],ymm1[0],ymm4[2],ymm1[2]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 320(%rsi), %ymm12
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 320(%rdi), %ymm15
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm9 = ymm15[0],ymm12[0],ymm15[2],ymm12[2]
+; AVX512BW-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm9 = ymm9[2,3],ymm11[2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm9, %zmm8, %zmm8
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm22 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm4[1],ymm1[1],ymm4[3],ymm1[3]
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm4 = ymm15[1],ymm12[1],ymm15[3],ymm12[3]
+; AVX512BW-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm22, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm26, %zmm18 {%k1}
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 384(%rcx), %ymm1
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 384(%rdx), %ymm3
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm4 = ymm3[0],ymm1[0],ymm3[2],ymm1[2]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 384(%rsi), %ymm7
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 384(%rdi), %ymm9
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm14 = ymm9[0],ymm7[0],ymm9[2],ymm7[2]
-; AVX512BW-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm4 = ymm14[2,3],ymm4[2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm18, %zmm18
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm29, %zmm25 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3]
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm9[1],ymm7[1],ymm9[3],ymm7[3]
-; AVX512BW-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm25, %zmm25
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm2 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 384(%rdx), %ymm9
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm10 = ymm9[0],ymm1[0],ymm9[2],ymm1[2]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 384(%rsi), %ymm11
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 384(%rdi), %ymm12
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm11[0],ymm12[2],ymm11[2]
+; AVX512BW-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm10 = ymm13[2,3],ymm10[2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm10, %zmm18, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm27 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm1[1],ymm9[3],ymm1[3]
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm9 = ymm12[1],ymm11[1],ymm12[3],ymm11[3]
+; AVX512BW-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm9[2,3],ymm1[2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm27, %zmm9
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm2 {%k1}
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 448(%rcx), %ymm1
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 448(%rdx), %ymm3
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm4 = ymm3[0],ymm1[0],ymm3[2],ymm1[2]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 448(%rsi), %ymm7
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 448(%rdi), %ymm9
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm12 = ymm9[0],ymm7[0],ymm9[2],ymm7[2]
-; AVX512BW-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm4 = ymm12[2,3],ymm4[2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm2, %zmm4
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3]
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm9[1],ymm7[1],ymm9[3],ymm7[3]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 448(%rdx), %ymm7
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm11 = ymm7[0],ymm1[0],ymm7[2],ymm1[2]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 448(%rsi), %ymm12
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 448(%rdi), %ymm13
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm15 = ymm13[0],ymm12[0],ymm13[2],ymm12[2]
+; AVX512BW-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm11 = ymm15[2,3],ymm11[2,3]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm11, %zmm2, %zmm11
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm1[1],ymm7[3],ymm1[3]
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm13[1],ymm12[1],ymm13[3],ymm12[3]
 ; AVX512BW-ONLY-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm17 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm17, %zmm9
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm11 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa (%rcx), %xmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa (%rdx), %xmm3
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm2[0]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa (%rsi), %xmm7
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa (%rdi), %xmm12
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm14 = xmm12[0],xmm7[0]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm14, %ymm1
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm11, %zmm11
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm3[1],xmm2[1]
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm3 = xmm12[1],xmm7[1]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm2, %ymm3, %ymm2
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm1, %zmm7
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 64(%rcx), %xmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 64(%rdx), %xmm12
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm12[0],xmm2[0]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 64(%rsi), %xmm14
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 64(%rdi), %xmm17
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm20 = xmm17[0],xmm14[0]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti32x4 $1, %xmm3, %ymm20, %ymm3
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm1, %zmm3
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm12[1],xmm2[1]
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm12 = xmm17[1],xmm14[1]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm2, %ymm12, %ymm2
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm1, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 128(%rcx), %xmm12
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 128(%rdx), %xmm14
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm17 = xmm14[0],xmm12[0]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 128(%rsi), %xmm20
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 128(%rdi), %xmm26
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm28 = xmm26[0],xmm20[0]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti32x4 $1, %xmm17, %ymm28, %ymm17
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm17, %zmm1, %zmm17
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm12 = xmm14[1],xmm12[1]
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm14 = xmm26[1],xmm20[1]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti128 $1, %xmm12, %ymm14, %ymm12
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm12, %zmm1, %zmm12
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm30, %zmm19 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm19, %zmm7
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 192(%rcx), %xmm14
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 192(%rdx), %xmm20
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm26 = xmm20[0],xmm14[0]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 192(%rsi), %xmm28
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 192(%rdi), %xmm29
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm30 = xmm29[0],xmm28[0]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti32x4 $1, %xmm26, %ymm30, %ymm26
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm26, %zmm1, %zmm26
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm14 = xmm20[1],xmm14[1]
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm20 = xmm29[1],xmm28[1]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti32x4 $1, %xmm14, %ymm20, %ymm14
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm14, %zmm1, %zmm14
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 256(%rcx), %xmm20
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 256(%rdx), %xmm28
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm29 = xmm28[0],xmm20[0]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 256(%rsi), %xmm30
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 256(%rdi), %xmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm31 = xmm0[0],xmm30[0]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti32x4 $1, %xmm29, %ymm31, %ymm29
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm29, %zmm1, %zmm29
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm27 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm20 = xmm28[1],xmm20[1]
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm30[1]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti32x4 $1, %xmm20, %ymm0, %ymm0
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm27, %zmm1
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm22 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 320(%rcx), %xmm20
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 320(%rdx), %xmm28
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm30 = xmm28[0],xmm20[0]
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 320(%rsi), %xmm31
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 320(%rdi), %xmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm27 = xmm0[0],xmm31[0]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti32x4 $1, %xmm30, %ymm27, %ymm27
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm27, %zmm22, %zmm22
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm13 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa (%rsi), %xmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti128 $1, (%rcx), %ymm1, %ymm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa (%rdi), %xmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti128 $1, (%rdx), %ymm1, %ymm12
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm12[0],ymm2[0],ymm12[2],ymm2[2]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm13, %zmm1
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm15 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm12[1],ymm2[1],ymm12[3],ymm2[3]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm15, %zmm2
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm18 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 64(%rsi), %xmm12
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti128 $1, 64(%rcx), %ymm12, %ymm12
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 64(%rdi), %xmm13
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti128 $1, 64(%rdx), %ymm13, %ymm13
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm15 = ymm13[0],ymm12[0],ymm13[2],ymm12[2]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm15, %zmm18, %zmm15
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm19 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm12 = ymm13[1],ymm12[1],ymm13[3],ymm12[3]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm12, %zmm19, %zmm12
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm23 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa 128(%rsi), %xmm13
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti128 $1, 128(%rcx), %ymm13, %ymm13
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 128(%rdi), %xmm18
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti32x4 $1, 128(%rdx), %ymm18, %ymm18
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm19 = ymm18[0],ymm13[0],ymm18[2],ymm13[2]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm19, %zmm23, %zmm19
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm27, %zmm23 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm20 = xmm28[1],xmm20[1]
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm31[1]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti32x4 $1, %xmm20, %ymm0, %ymm0
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm23, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm15 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 384(%rcx), %xmm20
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 384(%rdx), %xmm23
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm27 = xmm23[0],xmm20[0]
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm27 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm13 = ymm18[1],ymm13[1],ymm18[3],ymm13[3]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm13, %zmm27, %zmm13
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm5 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 192(%rsi), %xmm18
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti32x4 $1, 192(%rcx), %ymm18, %ymm18
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 192(%rdi), %xmm23
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti32x4 $1, 192(%rdx), %ymm23, %ymm23
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm27 = ymm23[0],ymm18[0],ymm23[2],ymm18[2]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm27, %zmm5, %zmm27
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm30, %zmm31 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm18 = ymm23[1],ymm18[1],ymm23[3],ymm18[3]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm18, %zmm31, %zmm18
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm16 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 256(%rsi), %xmm23
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti32x4 $1, 256(%rcx), %ymm23, %ymm23
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 256(%rdi), %xmm30
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti32x4 $1, 256(%rdx), %ymm30, %ymm30
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm31 = ymm30[0],ymm23[0],ymm30[2],ymm23[2]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm31, %zmm16, %zmm26
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm16 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm23 = ymm30[1],ymm23[1],ymm30[3],ymm23[3]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm23, %zmm16, %zmm23
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm24 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 320(%rsi), %xmm28
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti32x4 $1, 320(%rcx), %ymm28, %ymm28
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 320(%rdi), %xmm30
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti32x4 $1, 320(%rdx), %ymm30, %ymm30
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm31 = ymm30[0],ymm28[0],ymm30[2],ymm28[2]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm31, %zmm24, %zmm21
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm25 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm28 = ymm30[1],ymm28[1],ymm30[3],ymm28[3]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm28, %zmm25, %zmm22
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm17 {%k1}
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 384(%rsi), %xmm28
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti32x4 $1, 384(%rcx), %ymm28, %ymm28
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 384(%rdi), %xmm30
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm31 = xmm30[0],xmm28[0]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti32x4 $1, %xmm27, %ymm31, %ymm27
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm27, %zmm15, %zmm15
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm27, %zmm16 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm20 = xmm23[1],xmm20[1]
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm23 = xmm30[1],xmm28[1]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti32x4 $1, %xmm20, %ymm23, %ymm20
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm20, %zmm16, %zmm16
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm5 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 448(%rcx), %xmm20
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 448(%rdx), %xmm23
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm27 = xmm23[0],xmm20[0]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti32x4 $1, 384(%rdx), %ymm30, %ymm30
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm31 = ymm30[0],ymm28[0],ymm30[2],ymm28[2]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm31, %zmm17, %zmm17
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm20 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm28 = ymm30[1],ymm28[1],ymm30[3],ymm28[3]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm28, %zmm20, %zmm16
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm6 {%k1}
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 448(%rsi), %xmm28
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti32x4 $1, 448(%rcx), %ymm28, %ymm28
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 448(%rdi), %xmm30
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm31 = xmm30[0],xmm28[0]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti32x4 $1, %xmm27, %ymm31, %ymm27
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm27, %zmm5, %zmm5
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm27, %zmm6 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm20 = xmm23[1],xmm20[1]
-; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm23 = xmm30[1],xmm28[1]
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti32x4 $1, %xmm20, %ymm23, %ymm20
-; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm20, %zmm6, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti32x4 $1, 448(%rdx), %ymm30, %ymm30
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm31 = ymm30[0],ymm28[0],ymm30[2],ymm28[2]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm31, %zmm6, %zmm6
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm14 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm28 = ymm30[1],ymm28[1],ymm30[3],ymm28[3]
+; AVX512BW-ONLY-SLOW-NEXT:    vinserti64x4 $0, %ymm28, %zmm14, %zmm5
 ; AVX512BW-ONLY-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, 3776(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, 3712(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm25, 3264(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm18, 3200(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, 2752(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, 3776(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm11, 3712(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm9, 3264(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm10, 3200(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm4, 2752(%rax)
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm8, 2688(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm10, 2240(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm19, 2176(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm21, 1728(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm24, 1664(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm4, 1216(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm4, 1152(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovups (%rsp), %zmm4 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm4, 704(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm4, 640(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm4, 192(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm4, 128(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm4, 4032(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm4, 3968(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm4, 3904(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm4, 3840(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, 3648(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, 3584(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm4, 3520(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm4, 3456(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm4, 3392(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm4, 3328(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, 2240(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, 2176(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups (%rsp), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 1728(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm29, 1664(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 1216(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 1152(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 704(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 640(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 192(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 128(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 4032(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 3968(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 3904(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 3840(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm5, 3648(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, 3584(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 3520(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 3456(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 3392(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 3328(%rax)
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm16, 3136(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm15, 3072(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm4, 3008(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm4, 2944(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm4, 2880(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm4, 2816(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm0, 2624(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm22, 2560(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm17, 3072(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 3008(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 2944(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 2880(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 2816(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm22, 2624(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm21, 2560(%rax)
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 2496(%rax)
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
@@ -18293,8 +16779,8 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 2368(%rax)
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 2304(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, 2112(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm29, 2048(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm23, 2112(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm26, 2048(%rax)
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 1984(%rax)
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
@@ -18303,8 +16789,8 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 1856(%rax)
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 1792(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm14, 1600(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm26, 1536(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm18, 1600(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm27, 1536(%rax)
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 1472(%rax)
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
@@ -18313,8 +16799,8 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 1344(%rax)
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 1280(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm12, 1088(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm17, 1024(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm13, 1088(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm19, 1024(%rax)
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 960(%rax)
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
@@ -18323,8 +16809,8 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 832(%rax)
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 768(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, 576(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm3, 512(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm12, 576(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm15, 512(%rax)
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 448(%rax)
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
@@ -18333,608 +16819,621 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 320(%rax)
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-ONLY-SLOW-NEXT:    vmovaps %zmm0, 256(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, 64(%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm11, (%rax)
-; AVX512BW-ONLY-SLOW-NEXT:    addq $5384, %rsp # imm = 0x1508
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm2, 64(%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    vmovdqa64 %zmm1, (%rax)
+; AVX512BW-ONLY-SLOW-NEXT:    addq $5576, %rsp # imm = 0x15C8
 ; AVX512BW-ONLY-SLOW-NEXT:    vzeroupper
 ; AVX512BW-ONLY-SLOW-NEXT:    retq
 ;
 ; AVX512BW-ONLY-FAST-LABEL: store_i64_stride8_vf64:
 ; AVX512BW-ONLY-FAST:       # %bb.0:
-; AVX512BW-ONLY-FAST-NEXT:    subq $5384, %rsp # imm = 0x1508
+; AVX512BW-ONLY-FAST-NEXT:    subq $5576, %rsp # imm = 0x15C8
 ; AVX512BW-ONLY-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512BW-ONLY-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 128(%rdi), %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps 128(%rdi), %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%rdi), %zmm4
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%rdi), %zmm8
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 128(%rsi), %zmm16
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%rsi), %zmm17
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 128(%rsi), %zmm23
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%rsi), %zmm25
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%rsi), %zmm3
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%rdx), %zmm6
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%rdx), %zmm9
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%rcx), %zmm7
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%rcx), %zmm11
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%r8), %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%r8), %zmm24
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 128(%r8), %zmm20
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%r9), %zmm25
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 128(%r9), %zmm22
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%r9), %zmm18
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%r10), %zmm23
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%r10), %zmm21
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%rax), %zmm26
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%rax), %zmm19
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%rcx), %zmm10
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%r8), %zmm14
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%r8), %zmm19
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 128(%r8), %zmm16
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%r9), %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%r9), %zmm21
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 128(%r9), %zmm18
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%r10), %zmm20
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%r10), %zmm28
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 (%rax), %zmm26
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%rax), %zmm24
 ; AVX512BW-ONLY-FAST-NEXT:    movb $-64, %r11b
 ; AVX512BW-ONLY-FAST-NEXT:    kmovd %r11d, %k1
-; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm14 = [4,12,4,12,4,12,4,12]
-; AVX512BW-ONLY-FAST-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm21, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm19, %zmm14, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm10 = zmm2[0],zmm18[0],zmm2[2],zmm18[2],zmm2[4],zmm18[4],zmm2[6],zmm18[6]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm10 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [4,12,4,12,4,12,4,12]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm20, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm26, %zmm2, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm11 = zmm14[0],zmm1[0],zmm14[2],zmm1[2],zmm14[4],zmm1[4],zmm14[6],zmm1[6]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm11 {%k1}
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm14, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm15 = [4,12,4,12]
-; AVX512BW-ONLY-FAST-NEXT:    # ymm15 = mem[0,1,0,1]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, %zmm13
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm11, %zmm15, %zmm13
-; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm10, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm2, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm13 = [4,12,4,12]
+; AVX512BW-ONLY-FAST-NEXT:    # ymm13 = mem[0,1,0,1]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, %zmm12
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm10, %zmm13, %zmm12
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm11, %zmm0
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,5,13,5,13,5,13]
-; AVX512BW-ONLY-FAST-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm21, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm19, %zmm12, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm10 = zmm2[1],zmm18[1],zmm2[3],zmm18[3],zmm2[5],zmm18[5],zmm2[7],zmm18[7]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm5
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm10 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm5 = [5,13,5,13,5,13,5,13]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm20, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm26, %zmm5, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm11 = zmm14[1],zmm1[1],zmm14[3],zmm1[3],zmm14[5],zmm1[5],zmm14[7],zmm1[7]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm14, %zmm15
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm11 {%k1}
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm12, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [5,13,5,13]
-; AVX512BW-ONLY-FAST-NEXT:    # ymm2 = mem[0,1,0,1]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, %zmm13
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm11, %zmm2, %zmm13
-; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm10, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm5, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm14 = [5,13,5,13]
+; AVX512BW-ONLY-FAST-NEXT:    # ymm14 = mem[0,1,0,1]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, %zmm12
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm10, %zmm14, %zmm12
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm11, %zmm0
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,6,14,6,14,6,14]
-; AVX512BW-ONLY-FAST-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, %zmm27
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm18, %zmm13, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm21[0],zmm19[0],zmm21[2],zmm19[2],zmm21[4],zmm19[4],zmm21[6],zmm19[6]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, %zmm10
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm13, %zmm10
-; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [6,14,6,14]
-; AVX512BW-ONLY-FAST-NEXT:    # ymm5 = mem[0,1,0,1]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, %zmm13
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm11, %zmm5, %zmm13
-; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm13[4,5,6,7]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm10, %zmm0, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm12 = [6,14,6,14,6,14,6,14]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm15, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm15, %zmm22
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm17
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm12, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm20[0],zmm26[0],zmm20[2],zmm26[2],zmm20[4],zmm26[4],zmm20[6],zmm26[6]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, %zmm11
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm12, %zmm11
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm12, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm15 = [6,14,6,14]
+; AVX512BW-ONLY-FAST-NEXT:    # ymm15 = mem[0,1,0,1]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, %zmm12
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm10, %zmm15, %zmm12
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm11, %zmm0, %zmm0
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,7,15,7,15,7,15]
 ; AVX512BW-ONLY-FAST-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm27, %zmm10
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm18, %zmm0, %zmm10
-; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm10 {%k1} = zmm21[1],zmm19[1],zmm21[3],zmm19[3],zmm21[5],zmm19[5],zmm21[7],zmm19[7]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm22, %zmm11
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm17, %zmm0, %zmm11
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm11 {%k1} = zmm20[1],zmm26[1],zmm20[3],zmm26[3],zmm20[5],zmm26[5],zmm20[7],zmm26[7]
 ; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm0, %zmm8
 ; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} ymm27 = [7,15,7,15]
 ; AVX512BW-ONLY-FAST-NEXT:    # ymm27 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm11, %zmm27, %zmm9
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm10, %zmm27, %zmm9
 ; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm8, %zmm10, %zmm8
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm8, %zmm11, %zmm8
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm23, %zmm8
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm26, %zmm14, %zmm8
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm9 = zmm24[0],zmm25[0],zmm24[2],zmm25[2],zmm24[4],zmm25[4],zmm24[6],zmm25[6]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm28, %zmm8
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm12
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm24, %zmm2, %zmm8
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm9 = zmm19[0],zmm21[0],zmm19[2],zmm21[2],zmm19[4],zmm21[4],zmm19[6],zmm21[6]
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, %zmm9 {%k1}
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm8
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm17, %zmm14, %zmm8
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm25, %zmm2, %zmm8
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm6, %zmm10
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm15, %zmm10
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm13, %zmm10
 ; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7]
 ; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm8, %zmm9, %zmm8
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm23, %zmm8
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm26, %zmm12, %zmm8
-; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm9 = zmm24[1],zmm25[1],zmm24[3],zmm25[3],zmm24[5],zmm25[5],zmm24[7],zmm25[7]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm28, %zmm8
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm24, %zmm5, %zmm8
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm9 = zmm19[1],zmm21[1],zmm19[3],zmm21[3],zmm19[5],zmm21[5],zmm19[7],zmm21[7]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, %zmm9 {%k1}
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm8
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm17, %zmm12, %zmm8
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm25, %zmm5, %zmm8
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm6, %zmm10
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm2, %zmm10
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm14, %zmm10
 ; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7]
 ; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm8, %zmm9, %zmm8
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm8
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm17, %zmm11, %zmm8
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm25, %zmm1, %zmm8
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm6, %zmm9
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, %zmm10
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm5, %zmm9
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm15, %zmm9
 ; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm24, %zmm5
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm24, (%rsp) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm24, %zmm9
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm25, %zmm11, %zmm9
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm23[0],zmm26[0],zmm23[2],zmm26[2],zmm23[4],zmm26[4],zmm23[6],zmm26[6]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm19, %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm19, %zmm9
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm21, %zmm1, %zmm9
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm24, %zmm10
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm28[0],zmm24[0],zmm28[2],zmm24[2],zmm28[4],zmm24[4],zmm28[6],zmm24[6]
 ; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm8, %zmm9, %zmm8
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 128(%r10), %zmm9
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm17, %zmm0, %zmm4
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 128(%rax), %zmm18
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 128(%r10), %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm25, %zmm0, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 128(%rax), %zmm24
 ; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm27, %zmm6
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm25, %zmm0, %zmm5
-; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm23[1],zmm26[1],zmm23[3],zmm26[3],zmm23[5],zmm26[5],zmm23[7],zmm26[7]
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm21, %zmm0, %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm28[1],zmm10[1],zmm28[3],zmm10[3],zmm28[5],zmm10[5],zmm28[7],zmm10[7]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm10, %zmm21
 ; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7]
 ; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm5, %zmm4
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, %zmm4
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm18, %zmm14, %zmm4
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm5 = zmm20[0],zmm22[0],zmm20[2],zmm22[2],zmm20[4],zmm22[4],zmm20[6],zmm22[6]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm24, %zmm12, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm5 = zmm16[0],zmm18[0],zmm16[2],zmm18[2],zmm16[4],zmm18[4],zmm16[6],zmm18[6]
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm5 {%k1}
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm6
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm16, %zmm14, %zmm6
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, %zmm6
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm23, %zmm12, %zmm6
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 128(%rdx), %zmm4
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 128(%rcx), %zmm7
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm8
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm15, %zmm8
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm13, %zmm8
 ; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
 ; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm6, %zmm5, %zmm5
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, %zmm5
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm18, %zmm12, %zmm5
-; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm6 = zmm20[1],zmm22[1],zmm20[3],zmm22[3],zmm20[5],zmm22[5],zmm20[7],zmm22[7]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm24, %zmm3, %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm6 = zmm16[1],zmm18[1],zmm16[3],zmm18[3],zmm16[5],zmm18[5],zmm16[7],zmm18[7]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, %zmm6 {%k1}
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm5
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm16, %zmm12, %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm23, %zmm3, %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm10
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm8
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm2, %zmm8
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm13
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm14, %zmm8
 ; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm5, %zmm6, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm5
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm16, %zmm11, %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm5, %zmm6, %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm11
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm23, %zmm1, %zmm5
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm6
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm10, %zmm6
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm10, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm15, %zmm6
 ; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm20, %zmm6
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm22, %zmm11, %zmm6
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm18[0],zmm9[2],zmm18[2],zmm9[4],zmm18[4],zmm9[6],zmm18[6]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm5, %zmm6, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm16, %zmm0, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm16, %zmm6
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm18, %zmm1, %zmm6
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, (%rsp) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm2[0],zmm24[0],zmm2[2],zmm24[2],zmm2[4],zmm24[4],zmm2[6],zmm24[6]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm5, %zmm6, %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm23, %zmm0, %zmm9
 ; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm27, %zmm4
-; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm20, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm22, %zmm0, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm18[1],zmm9[3],zmm18[3],zmm9[5],zmm18[5],zmm9[7],zmm18[7]
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm4[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm16, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm18, %zmm0, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm3[1],zmm24[1],zmm3[3],zmm24[3],zmm3[5],zmm24[5],zmm3[7],zmm24[7]
 ; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm2, %zmm1
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 192(%r10), %zmm9
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 192(%rax), %zmm20
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, %zmm1
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm20, %zmm14, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 192(%r10), %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 192(%rax), %zmm17
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm17, %zmm12, %zmm1
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 192(%r8), %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 192(%r9), %zmm10
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm4 = zmm2[0],zmm10[0],zmm2[2],zmm10[2],zmm2[4],zmm10[4],zmm2[6],zmm10[6]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm16
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 192(%r9), %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm4 = zmm2[0],zmm5[0],zmm2[2],zmm5[2],zmm2[4],zmm5[4],zmm2[6],zmm5[6]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, %zmm16
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm9
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 192(%rdi), %zmm1
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 192(%rsi), %zmm5
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm6
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm14, %zmm6
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm12, %zmm6
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 192(%rdx), %zmm2
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 192(%rcx), %zmm7
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm8
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm15, %zmm8
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm13, %zmm8
 ; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
 ; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm6, %zmm4, %zmm4
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, %zmm4
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm20, %zmm12, %zmm4
-; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm6 = zmm16[1],zmm10[1],zmm16[3],zmm10[3],zmm16[5],zmm10[5],zmm16[7],zmm10[7]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm17, %zmm10, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm6 = zmm9[1],zmm16[1],zmm9[3],zmm16[3],zmm9[5],zmm16[5],zmm9[7],zmm16[7]
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm6 {%k1}
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm4
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm12, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm10, %zmm4
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm8
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm13, %zmm8
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm14, %zmm8
 ; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
 ; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm6, %zmm4
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm4
 ; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm11, %zmm4
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm6
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm3, %zmm6
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm15, %zmm6
 ; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm16, %zmm6
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm10, %zmm11, %zmm6
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm20[0],zmm9[2],zmm20[2],zmm9[4],zmm20[4],zmm9[6],zmm20[6]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, %zmm6
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm16, %zmm11, %zmm6
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm3[0],zmm17[0],zmm3[2],zmm17[2],zmm3[4],zmm17[4],zmm3[6],zmm17[6]
 ; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm6, %zmm4
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm0, %zmm1
 ; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm27, %zmm2
 ; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm16, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm10, %zmm0, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm20[1],zmm9[3],zmm20[3],zmm9[5],zmm20[5],zmm9[7],zmm20[7]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm16, %zmm0, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm3[1],zmm17[1],zmm3[3],zmm17[3],zmm3[5],zmm17[5],zmm3[7],zmm17[7]
 ; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm2, %zmm1
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 256(%r10), %zmm9
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 256(%rax), %zmm22
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, %zmm1
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm22, %zmm14, %zmm1
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 256(%r8), %zmm5
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 256(%r9), %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm4 = zmm5[0],zmm2[0],zmm5[2],zmm2[2],zmm5[4],zmm2[4],zmm5[6],zmm2[6]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, %zmm17
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm16
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 256(%r10), %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 256(%rax), %zmm25
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm25, %zmm12, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 256(%r8), %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 256(%r9), %zmm18
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm4 = zmm2[0],zmm18[0],zmm2[2],zmm18[2],zmm2[4],zmm18[4],zmm2[6],zmm18[6]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm9
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 256(%rdi), %zmm1
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 256(%rsi), %zmm5
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm6
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm14, %zmm6
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm12, %zmm6
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 256(%rdx), %zmm2
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 256(%rcx), %zmm7
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm8
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm15, %zmm8
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm13, %zmm8
 ; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
 ; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm6, %zmm4, %zmm4
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, %zmm4
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm22, %zmm12, %zmm4
-; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm6 = zmm17[1],zmm16[1],zmm17[3],zmm16[3],zmm17[5],zmm16[5],zmm17[7],zmm16[7]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm25, %zmm10, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm6 = zmm9[1],zmm18[1],zmm9[3],zmm18[3],zmm9[5],zmm18[5],zmm9[7],zmm18[7]
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, %zmm6 {%k1}
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm4
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm12, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm10, %zmm4
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm8
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm13, %zmm8
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm14, %zmm8
 ; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
 ; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm6, %zmm4
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm4
 ; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm11, %zmm4
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm6
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm3, %zmm6
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm15, %zmm6
 ; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm17, %zmm6
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm16, %zmm11, %zmm6
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm22[0],zmm9[2],zmm22[2],zmm9[4],zmm22[4],zmm9[6],zmm22[6]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, %zmm6
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm18, %zmm11, %zmm6
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm11, %zmm10
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm3[0],zmm25[0],zmm3[2],zmm25[2],zmm3[4],zmm25[4],zmm3[6],zmm25[6]
 ; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm6, %zmm4
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm0, %zmm1
 ; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm27, %zmm2
 ; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm17, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm16, %zmm0, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm22[1],zmm9[3],zmm22[3],zmm9[5],zmm22[5],zmm9[7],zmm22[7]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm18, %zmm0, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm3[1],zmm25[1],zmm3[3],zmm25[3],zmm3[5],zmm25[5],zmm3[7],zmm25[7]
 ; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm2, %zmm1
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 320(%rdi), %zmm1
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 320(%rsi), %zmm17
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 320(%rsi), %zmm2
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm4
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm17, %zmm14, %zmm4
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 320(%rdx), %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 320(%rcx), %zmm9
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm5
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm9, %zmm15, %zmm5
-; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %ymm4, %ymm24
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm6
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm9, %zmm13, %zmm6
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, %zmm7
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm9, %zmm3, %zmm7
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm5
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm9, %zmm27, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 384(%rdx), %zmm8
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm2, %zmm12, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 320(%rdx), %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 320(%rcx), %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, %zmm6
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm13, %zmm6
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm6 = ymm4[0,1,2,3],ymm6[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, %zmm11
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm14, %zmm11
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, %zmm8
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm15, %zmm8
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm5, %zmm27, %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 384(%rdx), %zmm3
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 384(%rcx), %zmm4
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, %zmm9
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm15, %zmm9
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, %zmm9
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm13, %zmm9
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, %zmm3
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm5, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm13, %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm14, %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm15, %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm27, %zmm3
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm27, %zmm8
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 448(%rdx), %zmm3
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 448(%rcx), %zmm4
-; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm4, %zmm3, %zmm15
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm4, %zmm3, %zmm13
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm4, %zmm3, %zmm5
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm4, %zmm3, %zmm14
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm4, %zmm3, %zmm15
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm27, %zmm3
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 320(%r10), %zmm26
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 320(%rax), %zmm16
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm26, %zmm3
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm16, %zmm14, %zmm3
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 320(%r8), %zmm13
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 320(%r9), %zmm15
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm9 = zmm13[0],zmm15[0],zmm13[2],zmm15[2],zmm13[4],zmm15[4],zmm13[6],zmm15[6]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm9 {%k1}
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm24, %zmm9, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 320(%r10), %zmm23
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 320(%rax), %zmm9
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm23, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm9, %zmm12, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 320(%r8), %zmm22
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 320(%r9), %zmm16
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm15 = zmm22[0],zmm16[0],zmm22[2],zmm16[2],zmm22[4],zmm16[4],zmm22[6],zmm16[6]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm15 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm6, %zmm15, %zmm3
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm26, %zmm3
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm16, %zmm12, %zmm3
-; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm5 = zmm13[1],zmm15[1],zmm13[3],zmm15[3],zmm13[5],zmm15[5],zmm13[7],zmm15[7]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm5 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm23, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm9, %zmm4, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm6 = zmm22[1],zmm16[1],zmm22[3],zmm16[3],zmm22[5],zmm16[5],zmm22[7],zmm16[7]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm6 {%k1}
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm3
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm17, %zmm12, %zmm3
-; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm5, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm2, %zmm4, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm6, %zmm3
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm3
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm17, %zmm11, %zmm3
-; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5,6,7]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm5
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm15, %zmm11, %zmm5
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm26[0],zmm16[0],zmm26[2],zmm16[2],zmm26[4],zmm16[4],zmm26[6],zmm16[6]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm5, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm10, %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm2, %zmm10, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm22, %zmm6
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm16, %zmm10, %zmm6
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm23[0],zmm9[0],zmm23[2],zmm9[2],zmm23[4],zmm9[4],zmm23[6],zmm9[6]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm6, %zmm3
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm17, %zmm0, %zmm1
-; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm15, %zmm0, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm26[1],zmm16[1],zmm26[3],zmm16[3],zmm26[5],zmm16[5],zmm26[7],zmm16[7]
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm2, %zmm0, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm22, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm16, %zmm0, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm23[1],zmm9[1],zmm23[3],zmm9[3],zmm23[5],zmm9[5],zmm23[7],zmm9[7]
 ; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm2, %zmm1
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 384(%rdi), %zmm3
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 384(%rsi), %zmm1
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm4
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm14, %zmm4
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm5
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm12, %zmm4
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm12, %zmm5
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm5
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm11, %zmm5
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm6
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm12, %zmm6
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm6
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm4, %zmm6
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm6
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm10, %zmm6
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm0, %zmm3
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 384(%r10), %zmm29
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 384(%r10), %zmm31
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 384(%rax), %zmm3
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm29, %zmm1
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm14, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm31, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm12, %zmm1
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm29, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm31, %zmm1
 ; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm4, %zmm1
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 384(%r8), %zmm25
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 384(%r9), %zmm12
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm25, %zmm31
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm12, %zmm11, %zmm31
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 448(%r10), %zmm28
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 448(%rax), %zmm9
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm28, %zmm1
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm9, %zmm14, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 384(%r8), %zmm27
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 384(%r9), %zmm10
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm27, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm10, %zmm5, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 448(%r10), %zmm30
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 448(%rax), %zmm11
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm30, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm11, %zmm12, %zmm1
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 448(%rdi), %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 448(%rdi), %zmm6
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 448(%rsi), %zmm1
-; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm1, %zmm5, %zmm14
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm28, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm9, %zmm4, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm1, %zmm6, %zmm12
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm30, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm11, %zmm4, %zmm2
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm1, %zmm5, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm1, %zmm6, %zmm4
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 448(%r8), %zmm17
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 448(%r9), %zmm8
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm17, %zmm30
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm8, %zmm11, %zmm30
-; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm1, %zmm5, %zmm11
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm0, %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 448(%r8), %zmm19
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 448(%r9), %zmm12
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm19, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm12, %zmm5, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm1, %zmm6, %zmm5
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm25, %zmm24
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm12, %zmm0, %zmm24
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm17, %zmm27
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm8, %zmm0, %zmm27
-; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm5 = [0,8,0,8,0,8,0,8]
-; AVX512BW-ONLY-FAST-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm21, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm19, %zmm5, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm6 = [1,9,1,9,1,9,1,9]
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm1, %zmm0, %zmm6
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm27, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm10, %zmm0, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm5
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm19, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm12, %zmm0, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm6 = [0,8,0,8,0,8,0,8]
 ; AVX512BW-ONLY-FAST-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm21, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm19, %zmm6, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm20, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm26, %zmm6, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm14 = [1,9,1,9,1,9,1,9]
+; AVX512BW-ONLY-FAST-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm20, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm26, %zmm14, %zmm0
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10]
 ; AVX512BW-ONLY-FAST-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm21, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm19, %zmm2, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm20, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm26, %zmm2, %zmm0
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [3,11,3,11,3,11,3,11]
 ; AVX512BW-ONLY-FAST-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm19, %zmm1, %zmm21
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm26, %zmm1, %zmm20
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm8
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm5, %zmm7
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm7
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm6, %zmm7
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm21
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm2, %zmm21
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm6, %zmm8
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm8
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm14, %zmm8
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm29
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm2, %zmm29
 ; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm1, %zmm0
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm23, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm28, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm21, %zmm6, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm28, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm21, %zmm14, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm28, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm21, %zmm2, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm21, %zmm1, %zmm28
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm5, %zmm4
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm23, %zmm4
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm6, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm26, %zmm6, %zmm4
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm23, %zmm4
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm2, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm26, %zmm14, %zmm4
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm1, %zmm23
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm28
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm26, %zmm2, %zmm28
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm26, %zmm1, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm7
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm5, %zmm7
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm7
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm6, %zmm7
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm19
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm2, %zmm19
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm1, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm4
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm18, %zmm5, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm24, %zmm6, %zmm4
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm4
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm18, %zmm6, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm24, %zmm14, %zmm4
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm4
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm18, %zmm2, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm24, %zmm2, %zmm4
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm18, %zmm1, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm24, %zmm1, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, %zmm4
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm7
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm5, %zmm7
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm7
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm6, %zmm7
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm6, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm14, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, %zmm21
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm2, %zmm21
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm0, %zmm1, %zmm7
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm14
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm2, %zmm14
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm1, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm4
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm20, %zmm5, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm17, %zmm6, %zmm4
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm4
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm20, %zmm6, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm17, %zmm14, %zmm4
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm4
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm20, %zmm2, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm17, %zmm2, %zmm4
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm20, %zmm1, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm17, %zmm1, %zmm0
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm4
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm10, %zmm5, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm6, %zmm4
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm4
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm10, %zmm6, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm14, %zmm4
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm11
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm10, %zmm2, %zmm11
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm10, %zmm1, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm15
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm2, %zmm15
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm7, %zmm1, %zmm0
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm4
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm22, %zmm5, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm25, %zmm6, %zmm4
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm4
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm22, %zmm6, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm25, %zmm14, %zmm4
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm4
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm22, %zmm2, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm25, %zmm2, %zmm4
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm22, %zmm1, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm25, %zmm1, %zmm0
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm7
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm5, %zmm7
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm7
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm6, %zmm7
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm10
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm2, %zmm10
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm4, %zmm1, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm26, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm16, %zmm5, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm18, %zmm6, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm18, %zmm14, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm13
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm18, %zmm2, %zmm13
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm18, %zmm1, %zmm0
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm26, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm16, %zmm6, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm23, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm9, %zmm6, %zmm0
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm26, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm16, %zmm2, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm23, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm9, %zmm14, %zmm0
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm16, %zmm1, %zmm26
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm22
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm15, %zmm5, %zmm22
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm23
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm15, %zmm6, %zmm23
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm7
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm15, %zmm2, %zmm7
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm15, %zmm1, %zmm13
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm29, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm5, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm23, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm9, %zmm2, %zmm0
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm29, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm9, %zmm1, %zmm23
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm22, %zmm24
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm16, %zmm6, %zmm24
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm22, %zmm25
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm16, %zmm14, %zmm25
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm22, %zmm8
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm16, %zmm2, %zmm8
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm16, %zmm1, %zmm22
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm31, %zmm0
 ; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm6, %zmm0
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm31 {%k1} = zmm29[0],zmm3[0],zmm29[2],zmm3[2],zmm29[4],zmm3[4],zmm29[6],zmm3[6]
-; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm24 {%k1} = zmm29[1],zmm3[1],zmm29[3],zmm3[3],zmm29[5],zmm3[5],zmm29[7],zmm3[7]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm29, %zmm20
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm2, %zmm20
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm1, %zmm29
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm25, %zmm15
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm12, %zmm5, %zmm15
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm25, %zmm16
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm12, %zmm6, %zmm16
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm25, %zmm18
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm12, %zmm2, %zmm18
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm25[0],zmm12[0],zmm25[2],zmm12[2],zmm25[4],zmm12[4],zmm25[6],zmm12[6]
-; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm4 = zmm25[1],zmm12[1],zmm25[3],zmm12[3],zmm25[5],zmm12[5],zmm25[7],zmm12[7]
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm12, %zmm1, %zmm25
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm28, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm9, %zmm5, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm31, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm14, %zmm0
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm28, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm9, %zmm6, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm31[0],zmm3[0],zmm31[2],zmm3[2],zmm31[4],zmm3[4],zmm31[6],zmm3[6]
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm31[1],zmm3[1],zmm31[3],zmm3[3],zmm31[5],zmm3[5],zmm31[7],zmm3[7]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm31, %zmm26
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm2, %zmm26
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm3, %zmm1, %zmm31
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm27, %zmm17
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm10, %zmm6, %zmm17
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm27, %zmm20
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm10, %zmm14, %zmm20
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm27, %zmm18
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm10, %zmm2, %zmm18
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm27[0],zmm10[0],zmm27[2],zmm10[2],zmm27[4],zmm10[4],zmm27[6],zmm10[6]
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm4 = zmm27[1],zmm10[1],zmm27[3],zmm10[3],zmm27[5],zmm10[5],zmm27[7],zmm10[7]
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm10, %zmm1, %zmm27
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm30, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm11, %zmm6, %zmm0
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm30 {%k1} = zmm28[0],zmm9[0],zmm28[2],zmm9[2],zmm28[4],zmm9[4],zmm28[6],zmm9[6]
-; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm28[1],zmm9[1],zmm28[3],zmm9[3],zmm28[5],zmm9[5],zmm28[7],zmm9[7]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm28, %zmm12
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm9, %zmm2, %zmm12
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm9, %zmm1, %zmm28
-; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm8, %zmm17, %zmm5
-; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm8, %zmm17, %zmm6
-; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm8, %zmm17, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm9 = zmm17[0],zmm8[0],zmm17[2],zmm8[2],zmm17[4],zmm8[4],zmm17[6],zmm8[6]
-; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm0 = zmm17[1],zmm8[1],zmm17[3],zmm8[3],zmm17[5],zmm8[5],zmm17[7],zmm8[7]
-; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm8, %zmm1, %zmm17
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm30, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm11, %zmm14, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm10 {%k1} = zmm30[0],zmm11[0],zmm30[2],zmm11[2],zmm30[4],zmm11[4],zmm30[6],zmm11[6]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm30[1],zmm11[1],zmm30[3],zmm11[3],zmm30[5],zmm11[5],zmm30[7],zmm11[7]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm30, %zmm16
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm11, %zmm2, %zmm16
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm11, %zmm1, %zmm30
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm12, %zmm19, %zmm6
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm12, %zmm19, %zmm14
+; AVX512BW-ONLY-FAST-NEXT:    vpermi2q %zmm12, %zmm19, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm11 = zmm19[0],zmm12[0],zmm19[2],zmm12[2],zmm19[4],zmm12[4],zmm19[6],zmm12[6]
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm0 = zmm19[1],zmm12[1],zmm19[3],zmm12[3],zmm19[5],zmm12[5],zmm19[7],zmm12[7]
+; AVX512BW-ONLY-FAST-NEXT:    vpermt2q %zmm12, %zmm1, %zmm19
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
 ; AVX512BW-ONLY-FAST-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
 ; AVX512BW-ONLY-FAST-NEXT:    # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, %zmm3 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, %zmm3 {%k1}
 ; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm3, %zmm1
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
@@ -18947,19 +17446,20 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
 ; AVX512BW-ONLY-FAST-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
 ; AVX512BW-ONLY-FAST-NEXT:    # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm31, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm7, %zmm1
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
 ; AVX512BW-ONLY-FAST-NEXT:    # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm24, %zmm1
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vinsertf64x4 $0, %ymm1, %zmm3, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
 ; AVX512BW-ONLY-FAST-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
 ; AVX512BW-ONLY-FAST-NEXT:    # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm9 {%k1}
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm9, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm11 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm11, %zmm1
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
 ; AVX512BW-ONLY-FAST-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
@@ -18971,338 +17471,308 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-ONLY-FAST-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
 ; AVX512BW-ONLY-FAST-NEXT:    # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm30, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm10, %zmm0
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-ONLY-FAST-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
 ; AVX512BW-ONLY-FAST-NEXT:    # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm27, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm5, %zmm0
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm21 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm29 {%k1}
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa (%rcx), %ymm0
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa (%rdx), %ymm1
 ; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa (%rsi), %ymm4
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa (%rdi), %ymm8
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm9 = ymm8[0],ymm4[0],ymm8[2],ymm4[2]
-; AVX512BW-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm9[2,3],ymm3[2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm21, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa (%rdi), %ymm9
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm11 = ymm9[0],ymm4[0],ymm9[2],ymm4[2]
+; AVX512BW-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm11[2,3],ymm3[2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm29, %zmm3
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm9 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm5 {%k1}
 ; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
-; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm4[1],ymm8[3],ymm4[3]
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm4[1],ymm9[3],ymm4[3]
 ; AVX512BW-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm9, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm5, %zmm0
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm19 {%k1}
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 64(%rsi), %ymm0
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 64(%rdi), %ymm1
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 64(%rcx), %ymm3
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 64(%rdx), %ymm4
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm8 = ymm4[0],ymm3[0],ymm4[2],ymm3[2]
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm9 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
-; AVX512BW-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm8 = ymm9[2,3],ymm8[2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm8, %zmm19, %zmm8
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 (%rsp), %zmm9 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, %zmm9 {%k1}
-; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm4[1],ymm3[1],ymm4[3],ymm3[3]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm28 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 64(%rcx), %ymm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 64(%rdx), %ymm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 64(%rsi), %ymm4
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 64(%rdi), %ymm9
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm11 = ymm9[0],ymm4[0],ymm9[2],ymm4[2]
+; AVX512BW-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm11[2,3],ymm3[2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm28, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm5 {%k1}
 ; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
-; AVX512BW-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm9, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm4[1],ymm9[3],ymm4[3]
+; AVX512BW-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm5, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm14 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm21 {%k1}
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 128(%rcx), %ymm0
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 128(%rdx), %ymm1
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 128(%rsi), %ymm8
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 128(%rdi), %ymm9
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm4 = ymm9[0],ymm8[0],ymm9[2],ymm8[2]
-; AVX512BW-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm14, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm4 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 128(%rsi), %ymm9
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 128(%rdi), %ymm11
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm11[0],ymm9[0],ymm11[2],ymm9[2]
+; AVX512BW-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm4[2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm21, %zmm3
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 (%rsp), %zmm3 # 64-byte Reload
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
 ; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
-; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm8[1],ymm9[3],ymm8[3]
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm11[1],ymm9[1],ymm11[3],ymm9[3]
 ; AVX512BW-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
 ; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm0
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm11 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm15 {%k1}
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 192(%rcx), %ymm0
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 192(%rdx), %ymm1
 ; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 192(%rsi), %ymm4
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 192(%rdi), %ymm8
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm14 = ymm8[0],ymm4[0],ymm8[2],ymm4[2]
-; AVX512BW-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm14[2,3],ymm3[2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm11, %zmm24
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 192(%rdi), %ymm9
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm11 = ymm9[0],ymm4[0],ymm9[2],ymm4[2]
+; AVX512BW-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm11[2,3],ymm3[2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm15, %zmm29
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm9 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm5 {%k1}
 ; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
-; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm4[1],ymm8[3],ymm4[3]
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm4[1],ymm9[3],ymm4[3]
 ; AVX512BW-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm9, %zmm21
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm5, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm10 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm13 {%k1}
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 256(%rcx), %ymm0
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 256(%rdx), %ymm1
 ; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 256(%rsi), %ymm4
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 256(%rdi), %ymm8
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm14 = ymm8[0],ymm4[0],ymm8[2],ymm4[2]
-; AVX512BW-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm14[2,3],ymm3[2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm10, %zmm19
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, %zmm9 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 256(%rdi), %ymm11
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm15 = ymm11[0],ymm4[0],ymm11[2],ymm4[2]
+; AVX512BW-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm15[2,3],ymm3[2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm13, %zmm3
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, %zmm7 {%k1}
 ; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
-; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm4[1],ymm8[3],ymm4[3]
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm11[1],ymm4[1],ymm11[3],ymm4[3]
 ; AVX512BW-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm9, %zmm10
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm7 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm7, %zmm0
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm8 {%k1}
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 320(%rcx), %ymm1
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 320(%rdx), %ymm3
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm3[0],ymm1[0],ymm3[2],ymm1[2]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 320(%rsi), %ymm4
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 320(%rdi), %ymm14
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm9 = ymm14[0],ymm4[0],ymm14[2],ymm4[2]
-; AVX512BW-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm9[2,3],ymm0[2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm7, %zmm8
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm26, %zmm13 {%k1}
-; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3]
-; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm14[1],ymm4[1],ymm14[3],ymm4[3]
-; AVX512BW-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm13, %zmm13
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm20, %zmm18 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 320(%rdx), %ymm4
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm11 = ymm4[0],ymm1[0],ymm4[2],ymm1[2]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 320(%rsi), %ymm12
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 320(%rdi), %ymm15
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm9 = ymm15[0],ymm12[0],ymm15[2],ymm12[2]
+; AVX512BW-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm9 = ymm9[2,3],ymm11[2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm9, %zmm8, %zmm8
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm23, %zmm22 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm4[1],ymm1[1],ymm4[3],ymm1[3]
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm4 = ymm15[1],ymm12[1],ymm15[3],ymm12[3]
+; AVX512BW-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm22, %zmm4
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm26, %zmm18 {%k1}
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 384(%rcx), %ymm1
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 384(%rdx), %ymm3
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm4 = ymm3[0],ymm1[0],ymm3[2],ymm1[2]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 384(%rsi), %ymm7
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 384(%rdi), %ymm9
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm14 = ymm9[0],ymm7[0],ymm9[2],ymm7[2]
-; AVX512BW-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm4 = ymm14[2,3],ymm4[2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm18, %zmm18
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm29, %zmm25 {%k1}
-; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3]
-; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm9[1],ymm7[1],ymm9[3],ymm7[3]
-; AVX512BW-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm25, %zmm25
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm12, %zmm2 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 384(%rdx), %ymm9
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm10 = ymm9[0],ymm1[0],ymm9[2],ymm1[2]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 384(%rsi), %ymm11
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 384(%rdi), %ymm12
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm11[0],ymm12[2],ymm11[2]
+; AVX512BW-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm10 = ymm13[2,3],ymm10[2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm10, %zmm18, %zmm10
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm31, %zmm27 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm1[1],ymm9[3],ymm1[3]
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm9 = ymm12[1],ymm11[1],ymm12[3],ymm11[3]
+; AVX512BW-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm9[2,3],ymm1[2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm27, %zmm9
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm16, %zmm2 {%k1}
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 448(%rcx), %ymm1
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 448(%rdx), %ymm3
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm4 = ymm3[0],ymm1[0],ymm3[2],ymm1[2]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 448(%rsi), %ymm7
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 448(%rdi), %ymm9
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm12 = ymm9[0],ymm7[0],ymm9[2],ymm7[2]
-; AVX512BW-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm4 = ymm12[2,3],ymm4[2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm2, %zmm4
-; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3]
-; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm9[1],ymm7[1],ymm9[3],ymm7[3]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 448(%rdx), %ymm7
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm11 = ymm7[0],ymm1[0],ymm7[2],ymm1[2]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 448(%rsi), %ymm12
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 448(%rdi), %ymm13
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm15 = ymm13[0],ymm12[0],ymm13[2],ymm12[2]
+; AVX512BW-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm11 = ymm15[2,3],ymm11[2,3]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm11, %zmm2, %zmm11
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm1[1],ymm7[3],ymm1[3]
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm13[1],ymm12[1],ymm13[3],ymm12[3]
 ; AVX512BW-ONLY-FAST-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm28, %zmm17 {%k1}
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm17, %zmm9
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm11 {%k1}
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa (%rcx), %xmm2
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa (%rdx), %xmm3
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm2[0]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa (%rsi), %xmm7
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa (%rdi), %xmm12
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm14 = xmm12[0],xmm7[0]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti128 $1, %xmm1, %ymm14, %ymm1
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm11, %zmm11
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm3[1],xmm2[1]
-; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm3 = xmm12[1],xmm7[1]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti128 $1, %xmm2, %ymm3, %ymm2
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm1, %zmm7
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 64(%rcx), %xmm2
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 64(%rdx), %xmm12
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm12[0],xmm2[0]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 64(%rsi), %xmm14
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 64(%rdi), %xmm17
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm20 = xmm17[0],xmm14[0]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti32x4 $1, %xmm3, %ymm20, %ymm3
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm1, %zmm3
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm12[1],xmm2[1]
-; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm12 = xmm17[1],xmm14[1]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti128 $1, %xmm2, %ymm12, %ymm2
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm1, %zmm2
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 128(%rcx), %xmm12
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 128(%rdx), %xmm14
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm17 = xmm14[0],xmm12[0]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 128(%rsi), %xmm20
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 128(%rdi), %xmm26
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm28 = xmm26[0],xmm20[0]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti32x4 $1, %xmm17, %ymm28, %ymm17
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm17, %zmm1, %zmm17
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm30, %zmm19 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm19, %zmm7
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm12 = xmm14[1],xmm12[1]
-; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm14 = xmm26[1],xmm20[1]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti128 $1, %xmm12, %ymm14, %ymm12
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm12, %zmm1, %zmm12
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 192(%rcx), %xmm14
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 192(%rdx), %xmm20
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm26 = xmm20[0],xmm14[0]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 192(%rsi), %xmm28
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 192(%rdi), %xmm29
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm30 = xmm29[0],xmm28[0]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti32x4 $1, %xmm26, %ymm30, %ymm26
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm26, %zmm1, %zmm26
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm14 = xmm20[1],xmm14[1]
-; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm20 = xmm29[1],xmm28[1]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti32x4 $1, %xmm14, %ymm20, %ymm14
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm14, %zmm1, %zmm14
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 256(%rcx), %xmm20
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 256(%rdx), %xmm28
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm29 = xmm28[0],xmm20[0]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 256(%rsi), %xmm30
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 256(%rdi), %xmm0
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm31 = xmm0[0],xmm30[0]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti32x4 $1, %xmm29, %ymm31, %ymm29
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm29, %zmm1, %zmm29
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm27 {%k1}
-; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm20 = xmm28[1],xmm20[1]
-; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm30[1]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti32x4 $1, %xmm20, %ymm0, %ymm0
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm27, %zmm1
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, %zmm22 {%k1}
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 320(%rcx), %xmm20
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 320(%rdx), %xmm28
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm30 = xmm28[0],xmm20[0]
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 320(%rsi), %xmm31
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 320(%rdi), %xmm0
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm27 = xmm0[0],xmm31[0]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti32x4 $1, %xmm30, %ymm27, %ymm27
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm27, %zmm22, %zmm22
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, %zmm13 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa (%rsi), %xmm1
+; AVX512BW-ONLY-FAST-NEXT:    vinserti128 $1, (%rcx), %ymm1, %ymm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa (%rdi), %xmm1
+; AVX512BW-ONLY-FAST-NEXT:    vinserti128 $1, (%rdx), %ymm1, %ymm12
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm12[0],ymm2[0],ymm12[2],ymm2[2]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm13, %zmm1
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm15 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm12[1],ymm2[1],ymm12[3],ymm2[3]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm15, %zmm2
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm12, %zmm18 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 64(%rsi), %xmm12
+; AVX512BW-ONLY-FAST-NEXT:    vinserti128 $1, 64(%rcx), %ymm12, %ymm12
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 64(%rdi), %xmm13
+; AVX512BW-ONLY-FAST-NEXT:    vinserti128 $1, 64(%rdx), %ymm13, %ymm13
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm15 = ymm13[0],ymm12[0],ymm13[2],ymm12[2]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm15, %zmm18, %zmm15
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm18, %zmm19 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm12 = ymm13[1],ymm12[1],ymm13[3],ymm12[3]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm12, %zmm19, %zmm12
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, %zmm23 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa 128(%rsi), %xmm13
+; AVX512BW-ONLY-FAST-NEXT:    vinserti128 $1, 128(%rcx), %ymm13, %ymm13
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 128(%rdi), %xmm18
+; AVX512BW-ONLY-FAST-NEXT:    vinserti32x4 $1, 128(%rdx), %ymm18, %ymm18
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm19 = ymm18[0],ymm13[0],ymm18[2],ymm13[2]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm19, %zmm23, %zmm19
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm27, %zmm23 {%k1}
-; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm20 = xmm28[1],xmm20[1]
-; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm31[1]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti32x4 $1, %xmm20, %ymm0, %ymm0
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm23, %zmm0
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm20, %zmm15 {%k1}
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 384(%rcx), %xmm20
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 384(%rdx), %xmm23
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm27 = xmm23[0],xmm20[0]
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm23, %zmm27 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm13 = ymm18[1],ymm13[1],ymm18[3],ymm13[3]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm13, %zmm27, %zmm13
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm18, %zmm5 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 192(%rsi), %xmm18
+; AVX512BW-ONLY-FAST-NEXT:    vinserti32x4 $1, 192(%rcx), %ymm18, %ymm18
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 192(%rdi), %xmm23
+; AVX512BW-ONLY-FAST-NEXT:    vinserti32x4 $1, 192(%rdx), %ymm23, %ymm23
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm27 = ymm23[0],ymm18[0],ymm23[2],ymm18[2]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm27, %zmm5, %zmm27
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm30, %zmm31 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm18 = ymm23[1],ymm18[1],ymm23[3],ymm18[3]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm18, %zmm31, %zmm18
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, %zmm16 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 256(%rsi), %xmm23
+; AVX512BW-ONLY-FAST-NEXT:    vinserti32x4 $1, 256(%rcx), %ymm23, %ymm23
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 256(%rdi), %xmm30
+; AVX512BW-ONLY-FAST-NEXT:    vinserti32x4 $1, 256(%rdx), %ymm30, %ymm30
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm31 = ymm30[0],ymm23[0],ymm30[2],ymm23[2]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm31, %zmm16, %zmm26
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, %zmm16 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm23 = ymm30[1],ymm23[1],ymm30[3],ymm23[3]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm23, %zmm16, %zmm23
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, %zmm24 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 320(%rsi), %xmm28
+; AVX512BW-ONLY-FAST-NEXT:    vinserti32x4 $1, 320(%rcx), %ymm28, %ymm28
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 320(%rdi), %xmm30
+; AVX512BW-ONLY-FAST-NEXT:    vinserti32x4 $1, 320(%rdx), %ymm30, %ymm30
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm31 = ymm30[0],ymm28[0],ymm30[2],ymm28[2]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm31, %zmm24, %zmm21
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, %zmm25 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm28 = ymm30[1],ymm28[1],ymm30[3],ymm28[3]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm28, %zmm25, %zmm22
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, %zmm17 {%k1}
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 384(%rsi), %xmm28
+; AVX512BW-ONLY-FAST-NEXT:    vinserti32x4 $1, 384(%rcx), %ymm28, %ymm28
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 384(%rdi), %xmm30
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm31 = xmm30[0],xmm28[0]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti32x4 $1, %xmm27, %ymm31, %ymm27
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm27, %zmm15, %zmm15
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm27, %zmm16 {%k1}
-; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm20 = xmm23[1],xmm20[1]
-; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm23 = xmm30[1],xmm28[1]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti32x4 $1, %xmm20, %ymm23, %ymm20
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm20, %zmm16, %zmm16
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm20, %zmm5 {%k1}
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 448(%rcx), %xmm20
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 448(%rdx), %xmm23
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm27 = xmm23[0],xmm20[0]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti32x4 $1, 384(%rdx), %ymm30, %ymm30
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm31 = ymm30[0],ymm28[0],ymm30[2],ymm28[2]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm31, %zmm17, %zmm17
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, %zmm20 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm28 = ymm30[1],ymm28[1],ymm30[3],ymm28[3]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm28, %zmm20, %zmm16
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, %zmm6 {%k1}
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 448(%rsi), %xmm28
+; AVX512BW-ONLY-FAST-NEXT:    vinserti32x4 $1, 448(%rcx), %ymm28, %ymm28
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 448(%rdi), %xmm30
-; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm31 = xmm30[0],xmm28[0]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti32x4 $1, %xmm27, %ymm31, %ymm27
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm27, %zmm5, %zmm5
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm27, %zmm6 {%k1}
-; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm20 = xmm23[1],xmm20[1]
-; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm23 = xmm30[1],xmm28[1]
-; AVX512BW-ONLY-FAST-NEXT:    vinserti32x4 $1, %xmm20, %ymm23, %ymm20
-; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm20, %zmm6, %zmm6
+; AVX512BW-ONLY-FAST-NEXT:    vinserti32x4 $1, 448(%rdx), %ymm30, %ymm30
+; AVX512BW-ONLY-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm31 = ymm30[0],ymm28[0],ymm30[2],ymm28[2]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm31, %zmm6, %zmm6
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, %zmm14 {%k1}
+; AVX512BW-ONLY-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm28 = ymm30[1],ymm28[1],ymm30[3],ymm28[3]
+; AVX512BW-ONLY-FAST-NEXT:    vinserti64x4 $0, %ymm28, %zmm14, %zmm5
 ; AVX512BW-ONLY-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, 3776(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, 3712(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm25, 3264(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm18, 3200(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, 2752(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, 3776(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm11, 3712(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm9, 3264(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm10, 3200(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm4, 2752(%rax)
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm8, 2688(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm10, 2240(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm19, 2176(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm21, 1728(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm24, 1664(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm4, 1216(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm4, 1152(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovups (%rsp), %zmm4 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm4, 704(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm4, 640(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm4, 192(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm4, 128(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm4, 4032(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm4, 3968(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm4, 3904(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm4, 3840(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm6, 3648(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, 3584(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm4, 3520(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm4, 3456(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm4, 3392(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm4, 3328(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, 2240(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, 2176(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups (%rsp), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 1728(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm29, 1664(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 1216(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 1152(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 704(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 640(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 192(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 128(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 4032(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 3968(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 3904(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 3840(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm5, 3648(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm6, 3584(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 3520(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 3456(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 3392(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 3328(%rax)
 ; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm16, 3136(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm15, 3072(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm4, 3008(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm4, 2944(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm4, 2880(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm4, 2816(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm0, 2624(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm22, 2560(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm17, 3072(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 3008(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 2944(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 2880(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 2816(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm22, 2624(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm21, 2560(%rax)
 ; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 2496(%rax)
 ; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
@@ -19311,8 +17781,8 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 2368(%rax)
 ; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 2304(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, 2112(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm29, 2048(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm23, 2112(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm26, 2048(%rax)
 ; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 1984(%rax)
 ; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
@@ -19321,8 +17791,8 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 1856(%rax)
 ; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 1792(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm14, 1600(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm26, 1536(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm18, 1600(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm27, 1536(%rax)
 ; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 1472(%rax)
 ; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
@@ -19331,8 +17801,8 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 1344(%rax)
 ; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 1280(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm12, 1088(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm17, 1024(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm13, 1088(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm19, 1024(%rax)
 ; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 960(%rax)
 ; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
@@ -19341,8 +17811,8 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 832(%rax)
 ; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 768(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, 576(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm3, 512(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm12, 576(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm15, 512(%rax)
 ; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 448(%rax)
 ; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
@@ -19351,608 +17821,621 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 320(%rax)
 ; AVX512BW-ONLY-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-ONLY-FAST-NEXT:    vmovaps %zmm0, 256(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm7, 64(%rax)
-; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm11, (%rax)
-; AVX512BW-ONLY-FAST-NEXT:    addq $5384, %rsp # imm = 0x1508
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm2, 64(%rax)
+; AVX512BW-ONLY-FAST-NEXT:    vmovdqa64 %zmm1, (%rax)
+; AVX512BW-ONLY-FAST-NEXT:    addq $5576, %rsp # imm = 0x15C8
 ; AVX512BW-ONLY-FAST-NEXT:    vzeroupper
 ; AVX512BW-ONLY-FAST-NEXT:    retq
 ;
 ; AVX512DQBW-SLOW-LABEL: store_i64_stride8_vf64:
 ; AVX512DQBW-SLOW:       # %bb.0:
-; AVX512DQBW-SLOW-NEXT:    subq $5384, %rsp # imm = 0x1508
+; AVX512DQBW-SLOW-NEXT:    subq $5576, %rsp # imm = 0x15C8
 ; AVX512DQBW-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512DQBW-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 128(%rdi), %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovaps 128(%rdi), %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 64(%rdi), %zmm4
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm8
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 128(%rsi), %zmm16
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 64(%rsi), %zmm17
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 128(%rsi), %zmm23
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 64(%rsi), %zmm25
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%rsi), %zmm3
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 64(%rdx), %zmm6
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%rdx), %zmm9
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 64(%rcx), %zmm7
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%rcx), %zmm11
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%r8), %zmm2
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 64(%r8), %zmm24
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 128(%r8), %zmm20
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 64(%r9), %zmm25
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 128(%r9), %zmm22
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%r9), %zmm18
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 64(%r10), %zmm23
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%r10), %zmm21
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 64(%rax), %zmm26
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%rax), %zmm19
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%rcx), %zmm10
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%r8), %zmm14
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 64(%r8), %zmm19
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 128(%r8), %zmm16
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%r9), %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 64(%r9), %zmm21
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 128(%r9), %zmm18
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%r10), %zmm20
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 64(%r10), %zmm28
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 (%rax), %zmm26
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 64(%rax), %zmm24
 ; AVX512DQBW-SLOW-NEXT:    movb $-64, %r11b
 ; AVX512DQBW-SLOW-NEXT:    kmovd %r11d, %k1
-; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm14 = [4,12,4,12,4,12,4,12]
-; AVX512DQBW-SLOW-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm19, %zmm14, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm10 = zmm2[0],zmm18[0],zmm2[2],zmm18[2],zmm2[4],zmm18[4],zmm2[6],zmm18[6]
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm10 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [4,12,4,12,4,12,4,12]
+; AVX512DQBW-SLOW-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm26, %zmm2, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm11 = zmm14[0],zmm1[0],zmm14[2],zmm1[2],zmm14[4],zmm1[4],zmm14[6],zmm1[6]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm11 {%k1}
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm3, %zmm14, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm15 = [4,12,4,12]
-; AVX512DQBW-SLOW-NEXT:    # ymm15 = mem[0,1,0,1]
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm13
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm11, %zmm15, %zmm13
-; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7]
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm10, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm3, %zmm2, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm13 = [4,12,4,12]
+; AVX512DQBW-SLOW-NEXT:    # ymm13 = mem[0,1,0,1]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm12
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm10, %zmm13, %zmm12
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm11, %zmm0
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,5,13,5,13,5,13]
-; AVX512DQBW-SLOW-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm19, %zmm12, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm10 = zmm2[1],zmm18[1],zmm2[3],zmm18[3],zmm2[5],zmm18[5],zmm2[7],zmm18[7]
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm5
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm10 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm5 = [5,13,5,13,5,13,5,13]
+; AVX512DQBW-SLOW-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm26, %zmm5, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm11 = zmm14[1],zmm1[1],zmm14[3],zmm1[3],zmm14[5],zmm1[5],zmm14[7],zmm1[7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm14, %zmm15
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm11 {%k1}
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm3, %zmm12, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [5,13,5,13]
-; AVX512DQBW-SLOW-NEXT:    # ymm2 = mem[0,1,0,1]
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm13
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm11, %zmm2, %zmm13
-; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7]
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm10, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm3, %zmm5, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm14 = [5,13,5,13]
+; AVX512DQBW-SLOW-NEXT:    # ymm14 = mem[0,1,0,1]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm12
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm10, %zmm14, %zmm12
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm11, %zmm0
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,6,14,6,14,6,14]
-; AVX512DQBW-SLOW-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm27
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm18, %zmm13, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm21[0],zmm19[0],zmm21[2],zmm19[2],zmm21[4],zmm19[4],zmm21[6],zmm19[6]
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm10
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm3, %zmm13, %zmm10
-; AVX512DQBW-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [6,14,6,14]
-; AVX512DQBW-SLOW-NEXT:    # ymm5 = mem[0,1,0,1]
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm13
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm11, %zmm5, %zmm13
-; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm13[4,5,6,7]
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm10, %zmm0, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm12 = [6,14,6,14,6,14,6,14]
+; AVX512DQBW-SLOW-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm15, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm15, %zmm22
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm17
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm1, %zmm12, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm20[0],zmm26[0],zmm20[2],zmm26[2],zmm20[4],zmm26[4],zmm20[6],zmm26[6]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm11
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm3, %zmm12, %zmm11
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm15 = [6,14,6,14]
+; AVX512DQBW-SLOW-NEXT:    # ymm15 = mem[0,1,0,1]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm12
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm10, %zmm15, %zmm12
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm11, %zmm0, %zmm0
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,7,15,7,15,7,15]
 ; AVX512DQBW-SLOW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm27, %zmm10
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm18, %zmm0, %zmm10
-; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm10 {%k1} = zmm21[1],zmm19[1],zmm21[3],zmm19[3],zmm21[5],zmm19[5],zmm21[7],zmm19[7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm22, %zmm11
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm17, %zmm0, %zmm11
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm11 {%k1} = zmm20[1],zmm26[1],zmm20[3],zmm26[3],zmm20[5],zmm26[5],zmm20[7],zmm26[7]
 ; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm3, %zmm0, %zmm8
 ; AVX512DQBW-SLOW-NEXT:    vbroadcasti64x2 {{.*#+}} ymm27 = [7,15,7,15]
 ; AVX512DQBW-SLOW-NEXT:    # ymm27 = mem[0,1,0,1]
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm11, %zmm27, %zmm9
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm10, %zmm27, %zmm9
 ; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm8, %zmm10, %zmm8
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm8, %zmm11, %zmm8
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm8
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm26, %zmm14, %zmm8
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm9 = zmm24[0],zmm25[0],zmm24[2],zmm25[2],zmm24[4],zmm25[4],zmm24[6],zmm25[6]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm8
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm12
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm24, %zmm2, %zmm8
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm9 = zmm19[0],zmm21[0],zmm19[2],zmm21[2],zmm19[4],zmm21[4],zmm19[6],zmm21[6]
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm9 {%k1}
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm8
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm17, %zmm14, %zmm8
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm25, %zmm2, %zmm8
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm10
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm7, %zmm15, %zmm10
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm7, %zmm13, %zmm10
 ; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7]
 ; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm8, %zmm9, %zmm8
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm8
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm26, %zmm12, %zmm8
-; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm9 = zmm24[1],zmm25[1],zmm24[3],zmm25[3],zmm24[5],zmm25[5],zmm24[7],zmm25[7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm8
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm24, %zmm5, %zmm8
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm9 = zmm19[1],zmm21[1],zmm19[3],zmm21[3],zmm19[5],zmm21[5],zmm19[7],zmm21[7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm9 {%k1}
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm8
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm17, %zmm12, %zmm8
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm25, %zmm5, %zmm8
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm10
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm7, %zmm2, %zmm10
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm7, %zmm14, %zmm10
 ; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7]
 ; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm8, %zmm9, %zmm8
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm8
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm17, %zmm11, %zmm8
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm25, %zmm1, %zmm8
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm9
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm10
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm7, %zmm5, %zmm9
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm7, %zmm15, %zmm9
 ; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm24, %zmm5
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm24, (%rsp) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm24, %zmm9
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm25, %zmm11, %zmm9
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm23[0],zmm26[0],zmm23[2],zmm26[2],zmm23[4],zmm26[4],zmm23[6],zmm26[6]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm19, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm19, %zmm9
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm21, %zmm1, %zmm9
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm24, %zmm10
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm28[0],zmm24[0],zmm28[2],zmm24[2],zmm28[4],zmm24[4],zmm28[6],zmm24[6]
 ; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm8, %zmm9, %zmm8
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 128(%r10), %zmm9
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm17, %zmm0, %zmm4
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 128(%rax), %zmm18
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 128(%r10), %zmm2
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm25, %zmm0, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 128(%rax), %zmm24
 ; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm7, %zmm27, %zmm6
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm25, %zmm0, %zmm5
-; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm23[1],zmm26[1],zmm23[3],zmm26[3],zmm23[5],zmm26[5],zmm23[7],zmm26[7]
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm21, %zmm0, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm28[1],zmm10[1],zmm28[3],zmm10[3],zmm28[5],zmm10[5],zmm28[7],zmm10[7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm21
 ; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7]
 ; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm5, %zmm4
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm4
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm18, %zmm14, %zmm4
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm5 = zmm20[0],zmm22[0],zmm20[2],zmm22[2],zmm20[4],zmm22[4],zmm20[6],zmm22[6]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm24, %zmm12, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm5 = zmm16[0],zmm18[0],zmm16[2],zmm18[2],zmm16[4],zmm18[4],zmm16[6],zmm18[6]
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm5 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm6
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm16, %zmm14, %zmm6
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm6
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm23, %zmm12, %zmm6
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 128(%rdx), %zmm4
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 128(%rcx), %zmm7
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm8
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm7, %zmm15, %zmm8
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm7, %zmm13, %zmm8
 ; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
 ; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm6, %zmm5, %zmm5
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm5
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm18, %zmm12, %zmm5
-; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm6 = zmm20[1],zmm22[1],zmm20[3],zmm22[3],zmm20[5],zmm22[5],zmm20[7],zmm22[7]
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm24, %zmm3, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm6 = zmm16[1],zmm18[1],zmm16[3],zmm18[3],zmm16[5],zmm18[5],zmm16[7],zmm18[7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm6 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm5
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm16, %zmm12, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm23, %zmm3, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm10
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm8
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm7, %zmm2, %zmm8
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm13
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm7, %zmm14, %zmm8
 ; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7]
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm5, %zmm6, %zmm2
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm5
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm16, %zmm11, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm5, %zmm6, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm11
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm23, %zmm1, %zmm5
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm6
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm7, %zmm10, %zmm6
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm7, %zmm15, %zmm6
 ; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7]
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm6
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm22, %zmm11, %zmm6
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm18[0],zmm9[2],zmm18[2],zmm9[4],zmm18[4],zmm9[6],zmm18[6]
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm5, %zmm6, %zmm2
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm16, %zmm0, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm6
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm18, %zmm1, %zmm6
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm2, (%rsp) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm2[0],zmm24[0],zmm2[2],zmm24[2],zmm2[4],zmm24[4],zmm2[6],zmm24[6]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm5, %zmm6, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm23, %zmm0, %zmm9
 ; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm7, %zmm27, %zmm4
-; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7]
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm2
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm22, %zmm0, %zmm2
-; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm18[1],zmm9[3],zmm18[3],zmm9[5],zmm18[5],zmm9[7],zmm18[7]
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm18, %zmm0, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm3[1],zmm24[1],zmm3[3],zmm24[3],zmm3[5],zmm24[5],zmm3[7],zmm24[7]
 ; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm2, %zmm1
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 192(%r10), %zmm9
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 192(%rax), %zmm20
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm1
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm20, %zmm14, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 192(%r10), %zmm3
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 192(%rax), %zmm17
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm17, %zmm12, %zmm1
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 192(%r8), %zmm2
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 192(%r9), %zmm10
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm4 = zmm2[0],zmm10[0],zmm2[2],zmm10[2],zmm2[4],zmm10[4],zmm2[6],zmm10[6]
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm16
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 192(%r9), %zmm5
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm4 = zmm2[0],zmm5[0],zmm2[2],zmm5[2],zmm2[4],zmm5[4],zmm2[6],zmm5[6]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm16
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm9
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 192(%rdi), %zmm1
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 192(%rsi), %zmm5
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm6
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm5, %zmm14, %zmm6
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm5, %zmm12, %zmm6
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 192(%rdx), %zmm2
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 192(%rcx), %zmm7
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm8
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm7, %zmm15, %zmm8
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm7, %zmm13, %zmm8
 ; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
 ; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm6, %zmm4, %zmm4
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm4
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm20, %zmm12, %zmm4
-; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm6 = zmm16[1],zmm10[1],zmm16[3],zmm10[3],zmm16[5],zmm10[5],zmm16[7],zmm10[7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm17, %zmm10, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm6 = zmm9[1],zmm16[1],zmm9[3],zmm16[3],zmm9[5],zmm16[5],zmm9[7],zmm16[7]
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm6 {%k1}
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm4
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm5, %zmm12, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm5, %zmm10, %zmm4
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm8
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm7, %zmm13, %zmm8
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm7, %zmm14, %zmm8
 ; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
 ; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm6, %zmm4
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm4
 ; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm5, %zmm11, %zmm4
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm6
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm7, %zmm3, %zmm6
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm7, %zmm15, %zmm6
 ; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7]
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm6
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm10, %zmm11, %zmm6
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm20[0],zmm9[2],zmm20[2],zmm9[4],zmm20[4],zmm9[6],zmm20[6]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm6
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm16, %zmm11, %zmm6
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm3[0],zmm17[0],zmm3[2],zmm17[2],zmm3[4],zmm17[4],zmm3[6],zmm17[6]
 ; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm6, %zmm4
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm5, %zmm0, %zmm1
 ; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm7, %zmm27, %zmm2
 ; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm2
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm10, %zmm0, %zmm2
-; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm20[1],zmm9[3],zmm20[3],zmm9[5],zmm20[5],zmm9[7],zmm20[7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm16, %zmm0, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm3[1],zmm17[1],zmm3[3],zmm17[3],zmm3[5],zmm17[5],zmm3[7],zmm17[7]
 ; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm2, %zmm1
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 256(%r10), %zmm9
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 256(%rax), %zmm22
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm1
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm22, %zmm14, %zmm1
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 256(%r8), %zmm5
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 256(%r9), %zmm2
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm4 = zmm5[0],zmm2[0],zmm5[2],zmm2[2],zmm5[4],zmm2[4],zmm5[6],zmm2[6]
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm17
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm16
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 256(%r10), %zmm3
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 256(%rax), %zmm25
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm25, %zmm12, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 256(%r8), %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 256(%r9), %zmm18
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm4 = zmm2[0],zmm18[0],zmm2[2],zmm18[2],zmm2[4],zmm18[4],zmm2[6],zmm18[6]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm9
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 256(%rdi), %zmm1
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 256(%rsi), %zmm5
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm6
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm5, %zmm14, %zmm6
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm5, %zmm12, %zmm6
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 256(%rdx), %zmm2
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 256(%rcx), %zmm7
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm8
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm7, %zmm15, %zmm8
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm7, %zmm13, %zmm8
 ; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
 ; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm6, %zmm4, %zmm4
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm4
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm22, %zmm12, %zmm4
-; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm6 = zmm17[1],zmm16[1],zmm17[3],zmm16[3],zmm17[5],zmm16[5],zmm17[7],zmm16[7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm25, %zmm10, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm6 = zmm9[1],zmm18[1],zmm9[3],zmm18[3],zmm9[5],zmm18[5],zmm9[7],zmm18[7]
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm4, %zmm6 {%k1}
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm4
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm5, %zmm12, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm5, %zmm10, %zmm4
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm8
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm7, %zmm13, %zmm8
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm7, %zmm14, %zmm8
 ; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
 ; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm6, %zmm4
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm4
 ; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm5, %zmm11, %zmm4
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm6
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm7, %zmm3, %zmm6
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm7, %zmm15, %zmm6
 ; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7]
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm6
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm16, %zmm11, %zmm6
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm22[0],zmm9[2],zmm22[2],zmm9[4],zmm22[4],zmm9[6],zmm22[6]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm6
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm18, %zmm11, %zmm6
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm11, %zmm10
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm3[0],zmm25[0],zmm3[2],zmm25[2],zmm3[4],zmm25[4],zmm3[6],zmm25[6]
 ; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm6, %zmm4
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm5, %zmm0, %zmm1
 ; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm7, %zmm27, %zmm2
 ; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm2
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm16, %zmm0, %zmm2
-; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm22[1],zmm9[3],zmm22[3],zmm9[5],zmm22[5],zmm9[7],zmm22[7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm18, %zmm0, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm3[1],zmm25[1],zmm3[3],zmm25[3],zmm3[5],zmm25[5],zmm3[7],zmm25[7]
 ; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm2, %zmm1
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 320(%rdi), %zmm1
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 320(%rsi), %zmm17
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 320(%rsi), %zmm2
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm4
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm17, %zmm14, %zmm4
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 320(%rdx), %zmm2
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 320(%rcx), %zmm9
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm5
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm9, %zmm15, %zmm5
-; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7]
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %ymm4, %ymm24
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm6
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm9, %zmm13, %zmm6
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm2, %zmm7
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm9, %zmm3, %zmm7
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm5
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm9, %zmm27, %zmm2
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 384(%rdx), %zmm8
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm2, %zmm12, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 320(%rdx), %zmm7
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 320(%rcx), %zmm5
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm6
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm5, %zmm13, %zmm6
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm6 = ymm4[0,1,2,3],ymm6[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm11
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm5, %zmm14, %zmm11
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm8
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm5, %zmm15, %zmm8
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm5, %zmm27, %zmm7
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 384(%rdx), %zmm3
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 384(%rcx), %zmm4
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm9
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm4, %zmm15, %zmm9
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm9
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm4, %zmm13, %zmm9
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm3
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm4, %zmm5, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm4, %zmm13, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm4, %zmm14, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm4, %zmm15, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm4, %zmm27, %zmm3
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm4, %zmm27, %zmm8
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 448(%rdx), %zmm3
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 448(%rcx), %zmm4
-; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm4, %zmm3, %zmm15
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm4, %zmm3, %zmm13
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm4, %zmm3, %zmm5
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm4, %zmm27, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm4, %zmm3, %zmm14
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm4, %zmm3, %zmm15
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm4, %zmm27, %zmm3
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 320(%r10), %zmm26
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 320(%rax), %zmm16
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm26, %zmm3
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm16, %zmm14, %zmm3
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 320(%r8), %zmm13
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 320(%r9), %zmm15
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm9 = zmm13[0],zmm15[0],zmm13[2],zmm15[2],zmm13[4],zmm15[4],zmm13[6],zmm15[6]
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm9 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm24, %zmm9, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 320(%r10), %zmm23
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 320(%rax), %zmm9
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm9, %zmm12, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 320(%r8), %zmm22
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 320(%r9), %zmm16
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm15 = zmm22[0],zmm16[0],zmm22[2],zmm16[2],zmm22[4],zmm16[4],zmm22[6],zmm16[6]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm15 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm6, %zmm15, %zmm3
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm26, %zmm3
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm16, %zmm12, %zmm3
-; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm5 = zmm13[1],zmm15[1],zmm13[3],zmm15[3],zmm13[5],zmm15[5],zmm13[7],zmm15[7]
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm5 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm9, %zmm4, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm6 = zmm22[1],zmm16[1],zmm22[3],zmm16[3],zmm22[5],zmm16[5],zmm22[7],zmm16[7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm6 {%k1}
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm3
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm17, %zmm12, %zmm3
-; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7]
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm5, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm2, %zmm4, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm6, %zmm3
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm3
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm17, %zmm11, %zmm3
-; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5,6,7]
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm5
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm15, %zmm11, %zmm5
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm26[0],zmm16[0],zmm26[2],zmm16[2],zmm26[4],zmm16[4],zmm26[6],zmm16[6]
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm5, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm10, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm2, %zmm10, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm22, %zmm6
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm16, %zmm10, %zmm6
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm23[0],zmm9[0],zmm23[2],zmm9[2],zmm23[4],zmm9[4],zmm23[6],zmm9[6]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm6, %zmm3
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm17, %zmm0, %zmm1
-; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm2
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm15, %zmm0, %zmm2
-; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm26[1],zmm16[1],zmm26[3],zmm16[3],zmm26[5],zmm16[5],zmm26[7],zmm16[7]
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm2, %zmm0, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm22, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm16, %zmm0, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm23[1],zmm9[1],zmm23[3],zmm9[3],zmm23[5],zmm9[5],zmm23[7],zmm9[7]
 ; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm2, %zmm1
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 384(%rdi), %zmm3
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 384(%rsi), %zmm1
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm4
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm1, %zmm14, %zmm4
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm5
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm4
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm1, %zmm12, %zmm5
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm5
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm1, %zmm11, %zmm5
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm6
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm1, %zmm12, %zmm6
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm6
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm1, %zmm4, %zmm6
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm6
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm1, %zmm10, %zmm6
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm1, %zmm0, %zmm3
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 384(%r10), %zmm29
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 384(%r10), %zmm31
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 384(%rax), %zmm3
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm29, %zmm1
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm3, %zmm14, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm3, %zmm12, %zmm1
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm29, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm1
 ; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm3, %zmm4, %zmm1
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 384(%r8), %zmm25
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 384(%r9), %zmm12
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm25, %zmm31
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm12, %zmm11, %zmm31
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 448(%r10), %zmm28
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 448(%rax), %zmm9
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm1
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm9, %zmm14, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 384(%r8), %zmm27
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 384(%r9), %zmm10
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm27, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm10, %zmm5, %zmm1
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 448(%rdi), %zmm5
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 448(%r10), %zmm30
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 448(%rax), %zmm11
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm30, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm11, %zmm12, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 448(%rdi), %zmm6
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 448(%rsi), %zmm1
-; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm1, %zmm5, %zmm14
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm2
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm9, %zmm4, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm1, %zmm6, %zmm12
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm30, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm11, %zmm4, %zmm2
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm1, %zmm5, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm1, %zmm6, %zmm4
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 448(%r8), %zmm17
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 448(%r9), %zmm8
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm30
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm8, %zmm11, %zmm30
-; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm1, %zmm5, %zmm11
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm1, %zmm0, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 448(%r8), %zmm19
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 448(%r9), %zmm12
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm19, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm12, %zmm5, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm1, %zmm6, %zmm5
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm25, %zmm24
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm12, %zmm0, %zmm24
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm17, %zmm27
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm8, %zmm0, %zmm27
-; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm5 = [0,8,0,8,0,8,0,8]
-; AVX512DQBW-SLOW-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm19, %zmm5, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm6 = [1,9,1,9,1,9,1,9]
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm1, %zmm0, %zmm6
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm27, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm10, %zmm0, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm5
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm19, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm12, %zmm0, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm6 = [0,8,0,8,0,8,0,8]
 ; AVX512DQBW-SLOW-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm19, %zmm6, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm26, %zmm6, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm14 = [1,9,1,9,1,9,1,9]
+; AVX512DQBW-SLOW-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm26, %zmm14, %zmm0
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10]
 ; AVX512DQBW-SLOW-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm21, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm19, %zmm2, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm26, %zmm2, %zmm0
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQBW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [3,11,3,11,3,11,3,11]
 ; AVX512DQBW-SLOW-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm19, %zmm1, %zmm21
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm26, %zmm1, %zmm20
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm7
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm8
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm4, %zmm5, %zmm7
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm7
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm4, %zmm6, %zmm7
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm21
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm4, %zmm2, %zmm21
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm4, %zmm6, %zmm8
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm8
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm4, %zmm14, %zmm8
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm29
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm4, %zmm2, %zmm29
 ; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm4, %zmm1, %zmm0
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm21, %zmm6, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm21, %zmm14, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm21, %zmm2, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm21, %zmm1, %zmm28
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm0, %zmm5, %zmm4
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm4
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm0, %zmm6, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm26, %zmm6, %zmm4
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm4
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm0, %zmm2, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm26, %zmm14, %zmm4
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm0, %zmm1, %zmm23
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm28
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm26, %zmm2, %zmm28
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm26, %zmm1, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm7
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm4, %zmm5, %zmm7
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm7
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm4, %zmm6, %zmm7
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm19
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm4, %zmm2, %zmm19
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm4, %zmm1, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm4
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm18, %zmm5, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm24, %zmm6, %zmm4
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm4
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm18, %zmm6, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm24, %zmm14, %zmm4
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm4
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm18, %zmm2, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm24, %zmm2, %zmm4
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm18, %zmm1, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm24, %zmm1, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm4
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm7
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm4, %zmm5, %zmm7
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm7
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm4, %zmm6, %zmm7
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm0, %zmm6, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm0, %zmm14, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm21
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm0, %zmm2, %zmm21
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm0, %zmm1, %zmm7
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm14
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm4, %zmm2, %zmm14
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm4, %zmm1, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm4
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm20, %zmm5, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm17, %zmm6, %zmm4
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm4
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm20, %zmm6, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm17, %zmm14, %zmm4
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm4
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm20, %zmm2, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm17, %zmm2, %zmm4
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm20, %zmm1, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm17, %zmm1, %zmm0
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm4
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm10, %zmm5, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm7, %zmm6, %zmm4
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm4
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm10, %zmm6, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm7, %zmm14, %zmm4
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm11
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm10, %zmm2, %zmm11
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm10, %zmm1, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm15
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm7, %zmm2, %zmm15
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm7, %zmm1, %zmm0
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm4
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm22, %zmm5, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm25, %zmm6, %zmm4
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm4
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm22, %zmm6, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm25, %zmm14, %zmm4
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm4
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm22, %zmm2, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm25, %zmm2, %zmm4
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm22, %zmm1, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm25, %zmm1, %zmm0
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm7
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm4, %zmm5, %zmm7
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm7
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm4, %zmm6, %zmm7
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm10
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm4, %zmm2, %zmm10
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm4, %zmm1, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm26, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm16, %zmm5, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm18, %zmm6, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm18, %zmm14, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm13
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm18, %zmm2, %zmm13
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm18, %zmm1, %zmm0
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm26, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm16, %zmm6, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm9, %zmm6, %zmm0
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm26, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm16, %zmm2, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm9, %zmm14, %zmm0
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm16, %zmm1, %zmm26
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm22
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm15, %zmm5, %zmm22
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm23
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm15, %zmm6, %zmm23
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm7
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm15, %zmm2, %zmm7
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm15, %zmm1, %zmm13
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm29, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm3, %zmm5, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm9, %zmm2, %zmm0
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm29, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm9, %zmm1, %zmm23
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm22, %zmm24
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm16, %zmm6, %zmm24
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm22, %zmm25
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm16, %zmm14, %zmm25
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm22, %zmm8
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm16, %zmm2, %zmm8
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm16, %zmm1, %zmm22
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm0
 ; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm3, %zmm6, %zmm0
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm31 {%k1} = zmm29[0],zmm3[0],zmm29[2],zmm3[2],zmm29[4],zmm3[4],zmm29[6],zmm3[6]
-; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm24 {%k1} = zmm29[1],zmm3[1],zmm29[3],zmm3[3],zmm29[5],zmm3[5],zmm29[7],zmm3[7]
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm29, %zmm20
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm3, %zmm2, %zmm20
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm3, %zmm1, %zmm29
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm25, %zmm15
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm12, %zmm5, %zmm15
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm25, %zmm16
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm12, %zmm6, %zmm16
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm25, %zmm18
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm12, %zmm2, %zmm18
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm25[0],zmm12[0],zmm25[2],zmm12[2],zmm25[4],zmm12[4],zmm25[6],zmm12[6]
-; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm4 = zmm25[1],zmm12[1],zmm25[3],zmm12[3],zmm25[5],zmm12[5],zmm25[7],zmm12[7]
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm12, %zmm1, %zmm25
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm9, %zmm5, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm3, %zmm14, %zmm0
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm9, %zmm6, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm31[0],zmm3[0],zmm31[2],zmm3[2],zmm31[4],zmm3[4],zmm31[6],zmm3[6]
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm31[1],zmm3[1],zmm31[3],zmm3[3],zmm31[5],zmm3[5],zmm31[7],zmm3[7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm26
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm3, %zmm2, %zmm26
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm3, %zmm1, %zmm31
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm27, %zmm17
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm10, %zmm6, %zmm17
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm27, %zmm20
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm10, %zmm14, %zmm20
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm27, %zmm18
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm10, %zmm2, %zmm18
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm27[0],zmm10[0],zmm27[2],zmm10[2],zmm27[4],zmm10[4],zmm27[6],zmm10[6]
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm4 = zmm27[1],zmm10[1],zmm27[3],zmm10[3],zmm27[5],zmm10[5],zmm27[7],zmm10[7]
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm10, %zmm1, %zmm27
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm30, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm11, %zmm6, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm30, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm11, %zmm14, %zmm0
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm30 {%k1} = zmm28[0],zmm9[0],zmm28[2],zmm9[2],zmm28[4],zmm9[4],zmm28[6],zmm9[6]
-; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm28[1],zmm9[1],zmm28[3],zmm9[3],zmm28[5],zmm9[5],zmm28[7],zmm9[7]
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm12
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm9, %zmm2, %zmm12
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm9, %zmm1, %zmm28
-; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm8, %zmm17, %zmm5
-; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm8, %zmm17, %zmm6
-; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm8, %zmm17, %zmm2
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm9 = zmm17[0],zmm8[0],zmm17[2],zmm8[2],zmm17[4],zmm8[4],zmm17[6],zmm8[6]
-; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm0 = zmm17[1],zmm8[1],zmm17[3],zmm8[3],zmm17[5],zmm8[5],zmm17[7],zmm8[7]
-; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm8, %zmm1, %zmm17
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm10 {%k1} = zmm30[0],zmm11[0],zmm30[2],zmm11[2],zmm30[4],zmm11[4],zmm30[6],zmm11[6]
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm30[1],zmm11[1],zmm30[3],zmm11[3],zmm30[5],zmm11[5],zmm30[7],zmm11[7]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm30, %zmm16
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm11, %zmm2, %zmm16
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm11, %zmm1, %zmm30
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm12, %zmm19, %zmm6
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm12, %zmm19, %zmm14
+; AVX512DQBW-SLOW-NEXT:    vpermi2q %zmm12, %zmm19, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} zmm11 = zmm19[0],zmm12[0],zmm19[2],zmm12[2],zmm19[4],zmm12[4],zmm19[6],zmm12[6]
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} zmm0 = zmm19[1],zmm12[1],zmm19[3],zmm12[3],zmm19[5],zmm12[5],zmm19[7],zmm12[7]
+; AVX512DQBW-SLOW-NEXT:    vpermt2q %zmm12, %zmm1, %zmm19
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
 ; AVX512DQBW-SLOW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
 ; AVX512DQBW-SLOW-NEXT:    # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm3 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm9, %zmm3 {%k1}
 ; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm3, %zmm1
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
@@ -19965,19 +18448,20 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
 ; AVX512DQBW-SLOW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
 ; AVX512DQBW-SLOW-NEXT:    # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm31, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm7, %zmm1
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
 ; AVX512DQBW-SLOW-NEXT:    # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm24, %zmm1
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vinsertf64x4 $0, %ymm1, %zmm3, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
 ; AVX512DQBW-SLOW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
 ; AVX512DQBW-SLOW-NEXT:    # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm9 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm9, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm11 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm11, %zmm1
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
 ; AVX512DQBW-SLOW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
@@ -19989,338 +18473,308 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQBW-SLOW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
 ; AVX512DQBW-SLOW-NEXT:    # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm30, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm10, %zmm0
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQBW-SLOW-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
 ; AVX512DQBW-SLOW-NEXT:    # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm27, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm5, %zmm0
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm21 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm29 {%k1}
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa (%rcx), %ymm0
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa (%rdx), %ymm1
 ; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa (%rsi), %ymm4
-; AVX512DQBW-SLOW-NEXT:    vmovdqa (%rdi), %ymm8
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm9 = ymm8[0],ymm4[0],ymm8[2],ymm4[2]
-; AVX512DQBW-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm9[2,3],ymm3[2,3]
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm21, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vmovdqa (%rdi), %ymm9
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm11 = ymm9[0],ymm4[0],ymm9[2],ymm4[2]
+; AVX512DQBW-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm11[2,3],ymm3[2,3]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm29, %zmm3
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm9 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm5 {%k1}
 ; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
-; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm4[1],ymm8[3],ymm4[3]
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm4[1],ymm9[3],ymm4[3]
 ; AVX512DQBW-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm9, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm5, %zmm0
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm19 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vmovdqa 64(%rsi), %ymm0
-; AVX512DQBW-SLOW-NEXT:    vmovdqa 64(%rdi), %ymm1
-; AVX512DQBW-SLOW-NEXT:    vmovdqa 64(%rcx), %ymm3
-; AVX512DQBW-SLOW-NEXT:    vmovdqa 64(%rdx), %ymm4
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm8 = ymm4[0],ymm3[0],ymm4[2],ymm3[2]
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm9 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
-; AVX512DQBW-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm8 = ymm9[2,3],ymm8[2,3]
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm8, %zmm19, %zmm8
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 (%rsp), %zmm9 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm8, %zmm9 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm4[1],ymm3[1],ymm4[3],ymm3[3]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm28 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 64(%rcx), %ymm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 64(%rdx), %ymm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 64(%rsi), %ymm4
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 64(%rdi), %ymm9
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm11 = ymm9[0],ymm4[0],ymm9[2],ymm4[2]
+; AVX512DQBW-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm11[2,3],ymm3[2,3]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm28, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm5 {%k1}
 ; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
-; AVX512DQBW-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3]
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm9, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm4[1],ymm9[3],ymm4[3]
+; AVX512DQBW-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm5, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm14 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm21 {%k1}
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa 128(%rcx), %ymm0
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa 128(%rdx), %ymm1
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
-; AVX512DQBW-SLOW-NEXT:    vmovdqa 128(%rsi), %ymm8
-; AVX512DQBW-SLOW-NEXT:    vmovdqa 128(%rdi), %ymm9
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm4 = ymm9[0],ymm8[0],ymm9[2],ymm8[2]
-; AVX512DQBW-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3]
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm14, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm4 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 128(%rsi), %ymm9
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 128(%rdi), %ymm11
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm11[0],ymm9[0],ymm11[2],ymm9[2]
+; AVX512DQBW-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm4[2,3]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm21, %zmm3
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 (%rsp), %zmm3 # 64-byte Reload
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
 ; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
-; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm8[1],ymm9[3],ymm8[3]
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm11[1],ymm9[1],ymm11[3],ymm9[3]
 ; AVX512DQBW-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
 ; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm0
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm11 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm15 {%k1}
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa 192(%rcx), %ymm0
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa 192(%rdx), %ymm1
 ; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa 192(%rsi), %ymm4
-; AVX512DQBW-SLOW-NEXT:    vmovdqa 192(%rdi), %ymm8
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm14 = ymm8[0],ymm4[0],ymm8[2],ymm4[2]
-; AVX512DQBW-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm14[2,3],ymm3[2,3]
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm11, %zmm24
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 192(%rdi), %ymm9
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm11 = ymm9[0],ymm4[0],ymm9[2],ymm4[2]
+; AVX512DQBW-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm11[2,3],ymm3[2,3]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm15, %zmm29
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm9 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm5 {%k1}
 ; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
-; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm4[1],ymm8[3],ymm4[3]
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm4[1],ymm9[3],ymm4[3]
 ; AVX512DQBW-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm9, %zmm21
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm5, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm10 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm13 {%k1}
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa 256(%rcx), %ymm0
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa 256(%rdx), %ymm1
 ; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa 256(%rsi), %ymm4
-; AVX512DQBW-SLOW-NEXT:    vmovdqa 256(%rdi), %ymm8
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm14 = ymm8[0],ymm4[0],ymm8[2],ymm4[2]
-; AVX512DQBW-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm14[2,3],ymm3[2,3]
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm10, %zmm19
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm3, %zmm9 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 256(%rdi), %ymm11
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm15 = ymm11[0],ymm4[0],ymm11[2],ymm4[2]
+; AVX512DQBW-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm15[2,3],ymm3[2,3]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm13, %zmm3
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm7 {%k1}
 ; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
-; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm4[1],ymm8[3],ymm4[3]
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm11[1],ymm4[1],ymm11[3],ymm4[3]
 ; AVX512DQBW-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm9, %zmm10
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm7 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm7, %zmm0
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm8 {%k1}
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa 320(%rcx), %ymm1
-; AVX512DQBW-SLOW-NEXT:    vmovdqa 320(%rdx), %ymm3
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm3[0],ymm1[0],ymm3[2],ymm1[2]
-; AVX512DQBW-SLOW-NEXT:    vmovdqa 320(%rsi), %ymm4
-; AVX512DQBW-SLOW-NEXT:    vmovdqa 320(%rdi), %ymm14
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm9 = ymm14[0],ymm4[0],ymm14[2],ymm4[2]
-; AVX512DQBW-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm9[2,3],ymm0[2,3]
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm7, %zmm8
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm26, %zmm13 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3]
-; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm14[1],ymm4[1],ymm14[3],ymm4[3]
-; AVX512DQBW-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3]
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm13, %zmm13
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm18 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 320(%rdx), %ymm4
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm11 = ymm4[0],ymm1[0],ymm4[2],ymm1[2]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 320(%rsi), %ymm12
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 320(%rdi), %ymm15
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm9 = ymm15[0],ymm12[0],ymm15[2],ymm12[2]
+; AVX512DQBW-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm9 = ymm9[2,3],ymm11[2,3]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm9, %zmm8, %zmm8
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm22 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm4[1],ymm1[1],ymm4[3],ymm1[3]
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm4 = ymm15[1],ymm12[1],ymm15[3],ymm12[3]
+; AVX512DQBW-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[2,3]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm22, %zmm4
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm26, %zmm18 {%k1}
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa 384(%rcx), %ymm1
-; AVX512DQBW-SLOW-NEXT:    vmovdqa 384(%rdx), %ymm3
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm4 = ymm3[0],ymm1[0],ymm3[2],ymm1[2]
-; AVX512DQBW-SLOW-NEXT:    vmovdqa 384(%rsi), %ymm7
-; AVX512DQBW-SLOW-NEXT:    vmovdqa 384(%rdi), %ymm9
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm14 = ymm9[0],ymm7[0],ymm9[2],ymm7[2]
-; AVX512DQBW-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm4 = ymm14[2,3],ymm4[2,3]
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm18, %zmm18
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm29, %zmm25 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3]
-; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm9[1],ymm7[1],ymm9[3],ymm7[3]
-; AVX512DQBW-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3]
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm25, %zmm25
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm2 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 384(%rdx), %ymm9
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm10 = ymm9[0],ymm1[0],ymm9[2],ymm1[2]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 384(%rsi), %ymm11
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 384(%rdi), %ymm12
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm11[0],ymm12[2],ymm11[2]
+; AVX512DQBW-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm10 = ymm13[2,3],ymm10[2,3]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm10, %zmm18, %zmm10
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm31, %zmm27 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm1[1],ymm9[3],ymm1[3]
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm9 = ymm12[1],ymm11[1],ymm12[3],ymm11[3]
+; AVX512DQBW-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm9[2,3],ymm1[2,3]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm27, %zmm9
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm16, %zmm2 {%k1}
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa 448(%rcx), %ymm1
-; AVX512DQBW-SLOW-NEXT:    vmovdqa 448(%rdx), %ymm3
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm4 = ymm3[0],ymm1[0],ymm3[2],ymm1[2]
-; AVX512DQBW-SLOW-NEXT:    vmovdqa 448(%rsi), %ymm7
-; AVX512DQBW-SLOW-NEXT:    vmovdqa 448(%rdi), %ymm9
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm12 = ymm9[0],ymm7[0],ymm9[2],ymm7[2]
-; AVX512DQBW-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm4 = ymm12[2,3],ymm4[2,3]
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm4, %zmm2, %zmm4
-; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3]
-; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm9[1],ymm7[1],ymm9[3],ymm7[3]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 448(%rdx), %ymm7
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm11 = ymm7[0],ymm1[0],ymm7[2],ymm1[2]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 448(%rsi), %ymm12
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 448(%rdi), %ymm13
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm15 = ymm13[0],ymm12[0],ymm13[2],ymm12[2]
+; AVX512DQBW-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm11 = ymm15[2,3],ymm11[2,3]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm11, %zmm2, %zmm11
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm1[1],ymm7[3],ymm1[3]
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm13[1],ymm12[1],ymm13[3],ymm12[3]
 ; AVX512DQBW-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm28, %zmm17 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm17, %zmm9
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm11 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vmovdqa (%rcx), %xmm2
-; AVX512DQBW-SLOW-NEXT:    vmovdqa (%rdx), %xmm3
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm2[0]
-; AVX512DQBW-SLOW-NEXT:    vmovdqa (%rsi), %xmm7
-; AVX512DQBW-SLOW-NEXT:    vmovdqa (%rdi), %xmm12
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm14 = xmm12[0],xmm7[0]
-; AVX512DQBW-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm14, %ymm1
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm11, %zmm11
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm3[1],xmm2[1]
-; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm3 = xmm12[1],xmm7[1]
-; AVX512DQBW-SLOW-NEXT:    vinserti128 $1, %xmm2, %ymm3, %ymm2
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm1, %zmm7
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm30, %zmm19 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm19, %zmm7
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vmovdqa 64(%rcx), %xmm2
-; AVX512DQBW-SLOW-NEXT:    vmovdqa 64(%rdx), %xmm12
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm12[0],xmm2[0]
-; AVX512DQBW-SLOW-NEXT:    vmovdqa 64(%rsi), %xmm14
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 64(%rdi), %xmm17
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm20 = xmm17[0],xmm14[0]
-; AVX512DQBW-SLOW-NEXT:    vinserti32x4 $1, %xmm3, %ymm20, %ymm3
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm3, %zmm1, %zmm3
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm12[1],xmm2[1]
-; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm12 = xmm17[1],xmm14[1]
-; AVX512DQBW-SLOW-NEXT:    vinserti128 $1, %xmm2, %ymm12, %ymm2
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm1, %zmm2
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vmovdqa 128(%rcx), %xmm12
-; AVX512DQBW-SLOW-NEXT:    vmovdqa 128(%rdx), %xmm14
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm17 = xmm14[0],xmm12[0]
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 128(%rsi), %xmm20
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 128(%rdi), %xmm26
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm28 = xmm26[0],xmm20[0]
-; AVX512DQBW-SLOW-NEXT:    vinserti32x4 $1, %xmm17, %ymm28, %ymm17
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm17, %zmm1, %zmm17
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm12 = xmm14[1],xmm12[1]
-; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm14 = xmm26[1],xmm20[1]
-; AVX512DQBW-SLOW-NEXT:    vinserti128 $1, %xmm12, %ymm14, %ymm12
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm12, %zmm1, %zmm12
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vmovdqa 192(%rcx), %xmm14
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 192(%rdx), %xmm20
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm26 = xmm20[0],xmm14[0]
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 192(%rsi), %xmm28
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 192(%rdi), %xmm29
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm30 = xmm29[0],xmm28[0]
-; AVX512DQBW-SLOW-NEXT:    vinserti32x4 $1, %xmm26, %ymm30, %ymm26
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm26, %zmm1, %zmm26
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm14 = xmm20[1],xmm14[1]
-; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm20 = xmm29[1],xmm28[1]
-; AVX512DQBW-SLOW-NEXT:    vinserti32x4 $1, %xmm14, %ymm20, %ymm14
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm14, %zmm1, %zmm14
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 256(%rcx), %xmm20
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 256(%rdx), %xmm28
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm29 = xmm28[0],xmm20[0]
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 256(%rsi), %xmm30
-; AVX512DQBW-SLOW-NEXT:    vmovdqa 256(%rdi), %xmm0
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm31 = xmm0[0],xmm30[0]
-; AVX512DQBW-SLOW-NEXT:    vinserti32x4 $1, %xmm29, %ymm31, %ymm29
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm29, %zmm1, %zmm29
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm27 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm20 = xmm28[1],xmm20[1]
-; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm30[1]
-; AVX512DQBW-SLOW-NEXT:    vinserti32x4 $1, %xmm20, %ymm0, %ymm0
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm27, %zmm1
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, %zmm22 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 320(%rcx), %xmm20
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 320(%rdx), %xmm28
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm30 = xmm28[0],xmm20[0]
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 320(%rsi), %xmm31
-; AVX512DQBW-SLOW-NEXT:    vmovdqa 320(%rdi), %xmm0
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm27 = xmm0[0],xmm31[0]
-; AVX512DQBW-SLOW-NEXT:    vinserti32x4 $1, %xmm30, %ymm27, %ymm27
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm27, %zmm22, %zmm22
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, %zmm13 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa (%rsi), %xmm1
+; AVX512DQBW-SLOW-NEXT:    vinserti128 $1, (%rcx), %ymm1, %ymm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqa (%rdi), %xmm1
+; AVX512DQBW-SLOW-NEXT:    vinserti128 $1, (%rdx), %ymm1, %ymm12
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm12[0],ymm2[0],ymm12[2],ymm2[2]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm1, %zmm13, %zmm1
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm15 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm12[1],ymm2[1],ymm12[3],ymm2[3]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm2, %zmm15, %zmm2
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm12, %zmm18 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 64(%rsi), %xmm12
+; AVX512DQBW-SLOW-NEXT:    vinserti128 $1, 64(%rcx), %ymm12, %ymm12
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 64(%rdi), %xmm13
+; AVX512DQBW-SLOW-NEXT:    vinserti128 $1, 64(%rdx), %ymm13, %ymm13
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm15 = ymm13[0],ymm12[0],ymm13[2],ymm12[2]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm15, %zmm18, %zmm15
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm19 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm12 = ymm13[1],ymm12[1],ymm13[3],ymm12[3]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm12, %zmm19, %zmm12
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm13, %zmm23 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa 128(%rsi), %xmm13
+; AVX512DQBW-SLOW-NEXT:    vinserti128 $1, 128(%rcx), %ymm13, %ymm13
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 128(%rdi), %xmm18
+; AVX512DQBW-SLOW-NEXT:    vinserti32x4 $1, 128(%rdx), %ymm18, %ymm18
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm19 = ymm18[0],ymm13[0],ymm18[2],ymm13[2]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm19, %zmm23, %zmm19
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
 ; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm27, %zmm23 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm20 = xmm28[1],xmm20[1]
-; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm31[1]
-; AVX512DQBW-SLOW-NEXT:    vinserti32x4 $1, %xmm20, %ymm0, %ymm0
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm0, %zmm23, %zmm0
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm15 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 384(%rcx), %xmm20
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 384(%rdx), %xmm23
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm27 = xmm23[0],xmm20[0]
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm23, %zmm27 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm13 = ymm18[1],ymm13[1],ymm18[3],ymm13[3]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm13, %zmm27, %zmm13
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm18, %zmm5 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 192(%rsi), %xmm18
+; AVX512DQBW-SLOW-NEXT:    vinserti32x4 $1, 192(%rcx), %ymm18, %ymm18
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 192(%rdi), %xmm23
+; AVX512DQBW-SLOW-NEXT:    vinserti32x4 $1, 192(%rdx), %ymm23, %ymm23
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm27 = ymm23[0],ymm18[0],ymm23[2],ymm18[2]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm27, %zmm5, %zmm27
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm30, %zmm31 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm18 = ymm23[1],ymm18[1],ymm23[3],ymm18[3]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm18, %zmm31, %zmm18
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm16 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 256(%rsi), %xmm23
+; AVX512DQBW-SLOW-NEXT:    vinserti32x4 $1, 256(%rcx), %ymm23, %ymm23
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 256(%rdi), %xmm30
+; AVX512DQBW-SLOW-NEXT:    vinserti32x4 $1, 256(%rdx), %ymm30, %ymm30
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm31 = ymm30[0],ymm23[0],ymm30[2],ymm23[2]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm31, %zmm16, %zmm26
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm16 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm23 = ymm30[1],ymm23[1],ymm30[3],ymm23[3]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm23, %zmm16, %zmm23
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm24 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 320(%rsi), %xmm28
+; AVX512DQBW-SLOW-NEXT:    vinserti32x4 $1, 320(%rcx), %ymm28, %ymm28
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 320(%rdi), %xmm30
+; AVX512DQBW-SLOW-NEXT:    vinserti32x4 $1, 320(%rdx), %ymm30, %ymm30
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm31 = ymm30[0],ymm28[0],ymm30[2],ymm28[2]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm31, %zmm24, %zmm21
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm25 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm28 = ymm30[1],ymm28[1],ymm30[3],ymm28[3]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm28, %zmm25, %zmm22
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm17 {%k1}
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 384(%rsi), %xmm28
+; AVX512DQBW-SLOW-NEXT:    vinserti32x4 $1, 384(%rcx), %ymm28, %ymm28
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 384(%rdi), %xmm30
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm31 = xmm30[0],xmm28[0]
-; AVX512DQBW-SLOW-NEXT:    vinserti32x4 $1, %xmm27, %ymm31, %ymm27
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm27, %zmm15, %zmm15
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm27, %zmm16 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm20 = xmm23[1],xmm20[1]
-; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm23 = xmm30[1],xmm28[1]
-; AVX512DQBW-SLOW-NEXT:    vinserti32x4 $1, %xmm20, %ymm23, %ymm20
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm20, %zmm16, %zmm16
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm20, %zmm5 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 448(%rcx), %xmm20
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 448(%rdx), %xmm23
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm27 = xmm23[0],xmm20[0]
+; AVX512DQBW-SLOW-NEXT:    vinserti32x4 $1, 384(%rdx), %ymm30, %ymm30
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm31 = ymm30[0],ymm28[0],ymm30[2],ymm28[2]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm31, %zmm17, %zmm17
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm20 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm28 = ymm30[1],ymm28[1],ymm30[3],ymm28[3]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm28, %zmm20, %zmm16
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm6 {%k1}
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 448(%rsi), %xmm28
+; AVX512DQBW-SLOW-NEXT:    vinserti32x4 $1, 448(%rcx), %ymm28, %ymm28
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 448(%rdi), %xmm30
-; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm31 = xmm30[0],xmm28[0]
-; AVX512DQBW-SLOW-NEXT:    vinserti32x4 $1, %xmm27, %ymm31, %ymm27
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm27, %zmm5, %zmm5
-; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm27, %zmm6 {%k1}
-; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm20 = xmm23[1],xmm20[1]
-; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} xmm23 = xmm30[1],xmm28[1]
-; AVX512DQBW-SLOW-NEXT:    vinserti32x4 $1, %xmm20, %ymm23, %ymm20
-; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm20, %zmm6, %zmm6
+; AVX512DQBW-SLOW-NEXT:    vinserti32x4 $1, 448(%rdx), %ymm30, %ymm30
+; AVX512DQBW-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm31 = ymm30[0],ymm28[0],ymm30[2],ymm28[2]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm31, %zmm6, %zmm6
+; AVX512DQBW-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm5, %zmm14 {%k1}
+; AVX512DQBW-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm28 = ymm30[1],ymm28[1],ymm30[3],ymm28[3]
+; AVX512DQBW-SLOW-NEXT:    vinserti64x4 $0, %ymm28, %zmm14, %zmm5
 ; AVX512DQBW-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm9, 3776(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm4, 3712(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm25, 3264(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm18, 3200(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm13, 2752(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm7, 3776(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm11, 3712(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm9, 3264(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm10, 3200(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm4, 2752(%rax)
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm8, 2688(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm10, 2240(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm19, 2176(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm21, 1728(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm24, 1664(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm4, 1216(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm4, 1152(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovups (%rsp), %zmm4 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm4, 704(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm4, 640(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm4, 192(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm4, 128(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm4, 4032(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm4, 3968(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm4, 3904(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm4, 3840(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm6, 3648(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm5, 3584(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm4, 3520(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm4, 3456(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm4, 3392(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm4, 3328(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, 2240(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm3, 2176(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovups (%rsp), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 1728(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm29, 1664(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 1216(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 1152(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 704(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 640(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 192(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 128(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 4032(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 3968(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 3904(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 3840(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm5, 3648(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm6, 3584(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 3520(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 3456(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 3392(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 3328(%rax)
 ; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm16, 3136(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm15, 3072(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm4, 3008(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm4, 2944(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm4, 2880(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm4, 2816(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm0, 2624(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm22, 2560(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm17, 3072(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 3008(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 2944(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 2880(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 2816(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm22, 2624(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm21, 2560(%rax)
 ; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 2496(%rax)
 ; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
@@ -20329,8 +18783,8 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 2368(%rax)
 ; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 2304(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, 2112(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm29, 2048(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm23, 2112(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm26, 2048(%rax)
 ; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 1984(%rax)
 ; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
@@ -20339,8 +18793,8 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 1856(%rax)
 ; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 1792(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm14, 1600(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm26, 1536(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm18, 1600(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm27, 1536(%rax)
 ; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 1472(%rax)
 ; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
@@ -20349,8 +18803,8 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 1344(%rax)
 ; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 1280(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm12, 1088(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm17, 1024(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm13, 1088(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm19, 1024(%rax)
 ; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 960(%rax)
 ; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
@@ -20359,8 +18813,8 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 832(%rax)
 ; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 768(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm2, 576(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm3, 512(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm12, 576(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm15, 512(%rax)
 ; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 448(%rax)
 ; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
@@ -20369,608 +18823,621 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 320(%rax)
 ; AVX512DQBW-SLOW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQBW-SLOW-NEXT:    vmovaps %zmm0, 256(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm7, 64(%rax)
-; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm11, (%rax)
-; AVX512DQBW-SLOW-NEXT:    addq $5384, %rsp # imm = 0x1508
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm2, 64(%rax)
+; AVX512DQBW-SLOW-NEXT:    vmovdqa64 %zmm1, (%rax)
+; AVX512DQBW-SLOW-NEXT:    addq $5576, %rsp # imm = 0x15C8
 ; AVX512DQBW-SLOW-NEXT:    vzeroupper
 ; AVX512DQBW-SLOW-NEXT:    retq
 ;
 ; AVX512DQBW-FAST-LABEL: store_i64_stride8_vf64:
 ; AVX512DQBW-FAST:       # %bb.0:
-; AVX512DQBW-FAST-NEXT:    subq $5384, %rsp # imm = 0x1508
+; AVX512DQBW-FAST-NEXT:    subq $5576, %rsp # imm = 0x15C8
 ; AVX512DQBW-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512DQBW-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 128(%rdi), %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovaps 128(%rdi), %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%rdi), %zmm4
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%rdi), %zmm8
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 128(%rsi), %zmm16
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%rsi), %zmm17
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 128(%rsi), %zmm23
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%rsi), %zmm25
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%rsi), %zmm3
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%rdx), %zmm6
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%rdx), %zmm9
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%rcx), %zmm7
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%rcx), %zmm11
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%r8), %zmm2
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%r8), %zmm24
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 128(%r8), %zmm20
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%r9), %zmm25
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 128(%r9), %zmm22
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%r9), %zmm18
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%r10), %zmm23
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%r10), %zmm21
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%rax), %zmm26
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%rax), %zmm19
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%rcx), %zmm10
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%r8), %zmm14
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%r8), %zmm19
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 128(%r8), %zmm16
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%r9), %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%r9), %zmm21
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 128(%r9), %zmm18
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%r10), %zmm20
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%r10), %zmm28
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%rax), %zmm26
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%rax), %zmm24
 ; AVX512DQBW-FAST-NEXT:    movb $-64, %r11b
 ; AVX512DQBW-FAST-NEXT:    kmovd %r11d, %k1
-; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm14 = [4,12,4,12,4,12,4,12]
-; AVX512DQBW-FAST-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm21, %zmm0
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm19, %zmm14, %zmm0
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm10 = zmm2[0],zmm18[0],zmm2[2],zmm18[2],zmm2[4],zmm18[4],zmm2[6],zmm18[6]
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm10 {%k1}
+; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [4,12,4,12,4,12,4,12]
+; AVX512DQBW-FAST-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm20, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm26, %zmm2, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm11 = zmm14[0],zmm1[0],zmm14[2],zmm1[2],zmm14[4],zmm1[4],zmm14[6],zmm1[6]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm11 {%k1}
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm8, %zmm0
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm3, %zmm14, %zmm0
-; AVX512DQBW-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm15 = [4,12,4,12]
-; AVX512DQBW-FAST-NEXT:    # ymm15 = mem[0,1,0,1]
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm9, %zmm13
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm11, %zmm15, %zmm13
-; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7]
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm10, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm3, %zmm2, %zmm0
+; AVX512DQBW-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm13 = [4,12,4,12]
+; AVX512DQBW-FAST-NEXT:    # ymm13 = mem[0,1,0,1]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm9, %zmm12
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm10, %zmm13, %zmm12
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm11, %zmm0
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,5,13,5,13,5,13]
-; AVX512DQBW-FAST-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm21, %zmm0
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm19, %zmm12, %zmm0
-; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm10 = zmm2[1],zmm18[1],zmm2[3],zmm18[3],zmm2[5],zmm18[5],zmm2[7],zmm18[7]
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm5
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm10 {%k1}
+; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm5 = [5,13,5,13,5,13,5,13]
+; AVX512DQBW-FAST-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm20, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm26, %zmm5, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm11 = zmm14[1],zmm1[1],zmm14[3],zmm1[3],zmm14[5],zmm1[5],zmm14[7],zmm1[7]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm14, %zmm15
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm11 {%k1}
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm8, %zmm0
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm3, %zmm12, %zmm0
-; AVX512DQBW-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [5,13,5,13]
-; AVX512DQBW-FAST-NEXT:    # ymm2 = mem[0,1,0,1]
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm9, %zmm13
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm11, %zmm2, %zmm13
-; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7]
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm10, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm3, %zmm5, %zmm0
+; AVX512DQBW-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm14 = [5,13,5,13]
+; AVX512DQBW-FAST-NEXT:    # ymm14 = mem[0,1,0,1]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm9, %zmm12
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm10, %zmm14, %zmm12
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm11, %zmm0
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,6,14,6,14,6,14]
-; AVX512DQBW-FAST-NEXT:    # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm5, %zmm0
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm5, %zmm27
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm18, %zmm13, %zmm0
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm21[0],zmm19[0],zmm21[2],zmm19[2],zmm21[4],zmm19[4],zmm21[6],zmm19[6]
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm8, %zmm10
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm3, %zmm13, %zmm10
-; AVX512DQBW-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [6,14,6,14]
-; AVX512DQBW-FAST-NEXT:    # ymm5 = mem[0,1,0,1]
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm9, %zmm13
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm11, %zmm5, %zmm13
-; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm13[4,5,6,7]
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm10, %zmm0, %zmm0
+; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm12 = [6,14,6,14,6,14,6,14]
+; AVX512DQBW-FAST-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm15, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm15, %zmm22
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, %zmm17
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm1, %zmm12, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm20[0],zmm26[0],zmm20[2],zmm26[2],zmm20[4],zmm26[4],zmm20[6],zmm26[6]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm8, %zmm11
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm3, %zmm12, %zmm11
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm12, %zmm1
+; AVX512DQBW-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm15 = [6,14,6,14]
+; AVX512DQBW-FAST-NEXT:    # ymm15 = mem[0,1,0,1]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm9, %zmm12
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm10, %zmm15, %zmm12
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm11, %zmm0, %zmm0
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,7,15,7,15,7,15]
 ; AVX512DQBW-FAST-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm27, %zmm10
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm18, %zmm0, %zmm10
-; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm10 {%k1} = zmm21[1],zmm19[1],zmm21[3],zmm19[3],zmm21[5],zmm19[5],zmm21[7],zmm19[7]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm22, %zmm11
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm17, %zmm0, %zmm11
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm11 {%k1} = zmm20[1],zmm26[1],zmm20[3],zmm26[3],zmm20[5],zmm26[5],zmm20[7],zmm26[7]
 ; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm3, %zmm0, %zmm8
 ; AVX512DQBW-FAST-NEXT:    vbroadcasti64x2 {{.*#+}} ymm27 = [7,15,7,15]
 ; AVX512DQBW-FAST-NEXT:    # ymm27 = mem[0,1,0,1]
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm11, %zmm27, %zmm9
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm10, %zmm27, %zmm9
 ; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm8, %zmm10, %zmm8
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm8, %zmm11, %zmm8
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm23, %zmm8
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm26, %zmm14, %zmm8
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm9 = zmm24[0],zmm25[0],zmm24[2],zmm25[2],zmm24[4],zmm25[4],zmm24[6],zmm25[6]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm28, %zmm8
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm12
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm24, %zmm2, %zmm8
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm9 = zmm19[0],zmm21[0],zmm19[2],zmm21[2],zmm19[4],zmm21[4],zmm19[6],zmm21[6]
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm8, %zmm9 {%k1}
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm4, %zmm8
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm17, %zmm14, %zmm8
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm25, %zmm2, %zmm8
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm6, %zmm10
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm7, %zmm15, %zmm10
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm7, %zmm13, %zmm10
 ; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7]
 ; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm8, %zmm9, %zmm8
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm23, %zmm8
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm26, %zmm12, %zmm8
-; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm9 = zmm24[1],zmm25[1],zmm24[3],zmm25[3],zmm24[5],zmm25[5],zmm24[7],zmm25[7]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm28, %zmm8
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm5, %zmm3
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm24, %zmm5, %zmm8
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm9 = zmm19[1],zmm21[1],zmm19[3],zmm21[3],zmm19[5],zmm21[5],zmm19[7],zmm21[7]
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm8, %zmm9 {%k1}
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm4, %zmm8
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm17, %zmm12, %zmm8
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm25, %zmm5, %zmm8
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm6, %zmm10
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm7, %zmm2, %zmm10
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm7, %zmm14, %zmm10
 ; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7]
 ; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm8, %zmm9, %zmm8
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm4, %zmm8
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm17, %zmm11, %zmm8
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm25, %zmm1, %zmm8
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm6, %zmm9
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm5, %zmm10
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm7, %zmm5, %zmm9
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm7, %zmm15, %zmm9
 ; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm24, %zmm5
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm24, (%rsp) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm24, %zmm9
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm25, %zmm11, %zmm9
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm23[0],zmm26[0],zmm23[2],zmm26[2],zmm23[4],zmm26[4],zmm23[6],zmm26[6]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm19, %zmm5
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm19, %zmm9
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm21, %zmm1, %zmm9
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm24, %zmm10
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm28[0],zmm24[0],zmm28[2],zmm24[2],zmm28[4],zmm24[4],zmm28[6],zmm24[6]
 ; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm8, %zmm9, %zmm8
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 128(%r10), %zmm9
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm17, %zmm0, %zmm4
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 128(%rax), %zmm18
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 128(%r10), %zmm2
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm25, %zmm0, %zmm4
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 128(%rax), %zmm24
 ; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm7, %zmm27, %zmm6
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm25, %zmm0, %zmm5
-; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm23[1],zmm26[1],zmm23[3],zmm26[3],zmm23[5],zmm26[5],zmm23[7],zmm26[7]
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm21, %zmm0, %zmm5
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm28[1],zmm10[1],zmm28[3],zmm10[3],zmm28[5],zmm10[5],zmm28[7],zmm10[7]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm10, %zmm21
 ; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7]
 ; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm5, %zmm4
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm9, %zmm4
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm18, %zmm14, %zmm4
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm5 = zmm20[0],zmm22[0],zmm20[2],zmm22[2],zmm20[4],zmm22[4],zmm20[6],zmm22[6]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm4
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm24, %zmm12, %zmm4
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm5 = zmm16[0],zmm18[0],zmm16[2],zmm18[2],zmm16[4],zmm18[4],zmm16[6],zmm18[6]
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm4, %zmm5 {%k1}
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, %zmm6
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm16, %zmm14, %zmm6
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm9, %zmm6
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm23, %zmm12, %zmm6
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 128(%rdx), %zmm4
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 128(%rcx), %zmm7
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm4, %zmm8
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm7, %zmm15, %zmm8
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm7, %zmm13, %zmm8
 ; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
 ; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm6, %zmm5, %zmm5
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm9, %zmm5
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm18, %zmm12, %zmm5
-; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm6 = zmm20[1],zmm22[1],zmm20[3],zmm22[3],zmm20[5],zmm22[5],zmm20[7],zmm22[7]
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm5
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm24, %zmm3, %zmm5
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm6 = zmm16[1],zmm18[1],zmm16[3],zmm18[3],zmm16[5],zmm18[5],zmm16[7],zmm18[7]
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm5, %zmm6 {%k1}
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, %zmm5
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm16, %zmm12, %zmm5
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm9, %zmm5
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm23, %zmm3, %zmm5
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, %zmm10
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm4, %zmm8
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm7, %zmm2, %zmm8
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm13
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm7, %zmm14, %zmm8
 ; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7]
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm5, %zmm6, %zmm2
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, %zmm5
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm16, %zmm11, %zmm5
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm5, %zmm6, %zmm5
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm9, %zmm5
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, %zmm11
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm23, %zmm1, %zmm5
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm4, %zmm6
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm7, %zmm10, %zmm6
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm10, %zmm3
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm7, %zmm15, %zmm6
 ; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7]
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm20, %zmm6
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm22, %zmm11, %zmm6
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm18[0],zmm9[2],zmm18[2],zmm9[4],zmm18[4],zmm9[6],zmm18[6]
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm5, %zmm6, %zmm2
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm16, %zmm0, %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm16, %zmm6
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm18, %zmm1, %zmm6
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm3
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm2, (%rsp) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm2[0],zmm24[0],zmm2[2],zmm24[2],zmm2[4],zmm24[4],zmm2[6],zmm24[6]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm5, %zmm6, %zmm5
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm23, %zmm0, %zmm9
 ; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm7, %zmm27, %zmm4
-; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7]
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm20, %zmm2
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm22, %zmm0, %zmm2
-; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm18[1],zmm9[3],zmm18[3],zmm9[5],zmm18[5],zmm9[7],zmm18[7]
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm16, %zmm2
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm18, %zmm0, %zmm2
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm3[1],zmm24[1],zmm3[3],zmm24[3],zmm3[5],zmm24[5],zmm3[7],zmm24[7]
 ; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm2, %zmm1
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 192(%r10), %zmm9
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 192(%rax), %zmm20
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm9, %zmm1
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm20, %zmm14, %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 192(%r10), %zmm3
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 192(%rax), %zmm17
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, %zmm1
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm17, %zmm12, %zmm1
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 192(%r8), %zmm2
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 192(%r9), %zmm10
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm4 = zmm2[0],zmm10[0],zmm2[2],zmm10[2],zmm2[4],zmm10[4],zmm2[6],zmm10[6]
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm16
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 192(%r9), %zmm5
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm4 = zmm2[0],zmm5[0],zmm2[2],zmm5[2],zmm2[4],zmm5[4],zmm2[6],zmm5[6]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm5, %zmm16
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm9
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 192(%rdi), %zmm1
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 192(%rsi), %zmm5
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, %zmm6
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm5, %zmm14, %zmm6
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm5, %zmm12, %zmm6
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 192(%rdx), %zmm2
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 192(%rcx), %zmm7
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm8
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm7, %zmm15, %zmm8
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm7, %zmm13, %zmm8
 ; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
 ; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm6, %zmm4, %zmm4
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm9, %zmm4
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm20, %zmm12, %zmm4
-; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm6 = zmm16[1],zmm10[1],zmm16[3],zmm10[3],zmm16[5],zmm10[5],zmm16[7],zmm10[7]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, %zmm4
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm17, %zmm10, %zmm4
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm6 = zmm9[1],zmm16[1],zmm9[3],zmm16[3],zmm9[5],zmm16[5],zmm9[7],zmm16[7]
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm4, %zmm6 {%k1}
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, %zmm4
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm5, %zmm12, %zmm4
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm5, %zmm10, %zmm4
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm8
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm7, %zmm13, %zmm8
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm7, %zmm14, %zmm8
 ; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
 ; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm6, %zmm4
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, %zmm4
 ; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm5, %zmm11, %zmm4
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm6
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm7, %zmm3, %zmm6
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm7, %zmm15, %zmm6
 ; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7]
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm16, %zmm6
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm10, %zmm11, %zmm6
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm20[0],zmm9[2],zmm20[2],zmm9[4],zmm20[4],zmm9[6],zmm20[6]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm9, %zmm6
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm16, %zmm11, %zmm6
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm3[0],zmm17[0],zmm3[2],zmm17[2],zmm3[4],zmm17[4],zmm3[6],zmm17[6]
 ; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm6, %zmm4
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm5, %zmm0, %zmm1
 ; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm7, %zmm27, %zmm2
 ; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm16, %zmm2
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm10, %zmm0, %zmm2
-; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm20[1],zmm9[3],zmm20[3],zmm9[5],zmm20[5],zmm9[7],zmm20[7]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm9, %zmm2
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm16, %zmm0, %zmm2
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm3[1],zmm17[1],zmm3[3],zmm17[3],zmm3[5],zmm17[5],zmm3[7],zmm17[7]
 ; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm2, %zmm1
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 256(%r10), %zmm9
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 256(%rax), %zmm22
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm9, %zmm1
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm22, %zmm14, %zmm1
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 256(%r8), %zmm5
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 256(%r9), %zmm2
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm4 = zmm5[0],zmm2[0],zmm5[2],zmm2[2],zmm5[4],zmm2[4],zmm5[6],zmm2[6]
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm5, %zmm17
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm16
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 256(%r10), %zmm3
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 256(%rax), %zmm25
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, %zmm1
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm25, %zmm12, %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 256(%r8), %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 256(%r9), %zmm18
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm4 = zmm2[0],zmm18[0],zmm2[2],zmm18[2],zmm2[4],zmm18[4],zmm2[6],zmm18[6]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm9
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 256(%rdi), %zmm1
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 256(%rsi), %zmm5
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, %zmm6
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm5, %zmm14, %zmm6
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm5, %zmm12, %zmm6
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 256(%rdx), %zmm2
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 256(%rcx), %zmm7
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm8
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm7, %zmm15, %zmm8
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm7, %zmm13, %zmm8
 ; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
 ; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm6, %zmm4, %zmm4
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm9, %zmm4
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm22, %zmm12, %zmm4
-; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm6 = zmm17[1],zmm16[1],zmm17[3],zmm16[3],zmm17[5],zmm16[5],zmm17[7],zmm16[7]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, %zmm4
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm25, %zmm10, %zmm4
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm6 = zmm9[1],zmm18[1],zmm9[3],zmm18[3],zmm9[5],zmm18[5],zmm9[7],zmm18[7]
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm4, %zmm6 {%k1}
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, %zmm4
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm5, %zmm12, %zmm4
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm5, %zmm10, %zmm4
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm8
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm7, %zmm13, %zmm8
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm7, %zmm14, %zmm8
 ; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
 ; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm6, %zmm4
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, %zmm4
 ; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm5, %zmm11, %zmm4
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm6
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm7, %zmm3, %zmm6
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm7, %zmm15, %zmm6
 ; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7]
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm17, %zmm6
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm16, %zmm11, %zmm6
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm22[0],zmm9[2],zmm22[2],zmm9[4],zmm22[4],zmm9[6],zmm22[6]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm9, %zmm6
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm18, %zmm11, %zmm6
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm11, %zmm10
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm3[0],zmm25[0],zmm3[2],zmm25[2],zmm3[4],zmm25[4],zmm3[6],zmm25[6]
 ; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm6, %zmm4
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm5, %zmm0, %zmm1
 ; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm7, %zmm27, %zmm2
 ; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm17, %zmm2
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm16, %zmm0, %zmm2
-; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm22[1],zmm9[3],zmm22[3],zmm9[5],zmm22[5],zmm9[7],zmm22[7]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm9, %zmm2
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm18, %zmm0, %zmm2
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm3[1],zmm25[1],zmm3[3],zmm25[3],zmm3[5],zmm25[5],zmm3[7],zmm25[7]
 ; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm2, %zmm1
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 320(%rdi), %zmm1
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 320(%rsi), %zmm17
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 320(%rsi), %zmm2
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, %zmm4
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm17, %zmm14, %zmm4
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 320(%rdx), %zmm2
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 320(%rcx), %zmm9
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm5
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm9, %zmm15, %zmm5
-; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7]
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %ymm4, %ymm24
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm6
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm9, %zmm13, %zmm6
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm7
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm9, %zmm3, %zmm7
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, %zmm5
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm9, %zmm27, %zmm2
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 384(%rdx), %zmm8
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm2, %zmm12, %zmm4
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 320(%rdx), %zmm7
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 320(%rcx), %zmm5
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm7, %zmm6
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm5, %zmm13, %zmm6
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm6 = ymm4[0,1,2,3],ymm6[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm7, %zmm11
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm5, %zmm14, %zmm11
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm7, %zmm8
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm5, %zmm15, %zmm8
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm5, %zmm27, %zmm7
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 384(%rdx), %zmm3
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 384(%rcx), %zmm4
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm8, %zmm9
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm4, %zmm15, %zmm9
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm8, %zmm9
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm4, %zmm13, %zmm9
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm8, %zmm3
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm4, %zmm5, %zmm3
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, %zmm5
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm4, %zmm13, %zmm5
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, %zmm5
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm4, %zmm14, %zmm5
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, %zmm5
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm4, %zmm15, %zmm5
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm4, %zmm27, %zmm3
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm4, %zmm27, %zmm8
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 448(%rdx), %zmm3
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 448(%rcx), %zmm4
-; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm4, %zmm3, %zmm15
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm4, %zmm3, %zmm13
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm4, %zmm3, %zmm5
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm4, %zmm3, %zmm14
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm4, %zmm3, %zmm15
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm4, %zmm27, %zmm3
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 320(%r10), %zmm26
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 320(%rax), %zmm16
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm26, %zmm3
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm16, %zmm14, %zmm3
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 320(%r8), %zmm13
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 320(%r9), %zmm15
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm9 = zmm13[0],zmm15[0],zmm13[2],zmm15[2],zmm13[4],zmm15[4],zmm13[6],zmm15[6]
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, %zmm9 {%k1}
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm24, %zmm9, %zmm3
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 320(%r10), %zmm23
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 320(%rax), %zmm9
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm23, %zmm3
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm9, %zmm12, %zmm3
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 320(%r8), %zmm22
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 320(%r9), %zmm16
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm15 = zmm22[0],zmm16[0],zmm22[2],zmm16[2],zmm22[4],zmm16[4],zmm22[6],zmm16[6]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, %zmm15 {%k1}
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm6, %zmm15, %zmm3
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm26, %zmm3
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm16, %zmm12, %zmm3
-; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm5 = zmm13[1],zmm15[1],zmm13[3],zmm15[3],zmm13[5],zmm15[5],zmm13[7],zmm15[7]
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, %zmm5 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm23, %zmm3
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm9, %zmm4, %zmm3
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm6 = zmm22[1],zmm16[1],zmm22[3],zmm16[3],zmm22[5],zmm16[5],zmm22[7],zmm16[7]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, %zmm6 {%k1}
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, %zmm3
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm17, %zmm12, %zmm3
-; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7]
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm5, %zmm3
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm2, %zmm4, %zmm3
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm6, %zmm3
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, %zmm3
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm17, %zmm11, %zmm3
-; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5,6,7]
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm13, %zmm5
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm15, %zmm11, %zmm5
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm26[0],zmm16[0],zmm26[2],zmm16[2],zmm26[4],zmm16[4],zmm26[6],zmm16[6]
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm5, %zmm3
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm10, %zmm5
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm2, %zmm10, %zmm3
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm22, %zmm6
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm16, %zmm10, %zmm6
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm23[0],zmm9[0],zmm23[2],zmm9[2],zmm23[4],zmm9[4],zmm23[6],zmm9[6]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm6, %zmm3
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm17, %zmm0, %zmm1
-; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm13, %zmm2
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm15, %zmm0, %zmm2
-; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm26[1],zmm16[1],zmm26[3],zmm16[3],zmm26[5],zmm16[5],zmm26[7],zmm16[7]
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm2, %zmm0, %zmm1
+; AVX512DQBW-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm22, %zmm2
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm16, %zmm0, %zmm2
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm23[1],zmm9[1],zmm23[3],zmm9[3],zmm23[5],zmm9[5],zmm23[7],zmm9[7]
 ; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm2, %zmm1
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 384(%rdi), %zmm3
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 384(%rsi), %zmm1
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, %zmm4
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm1, %zmm14, %zmm4
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, %zmm5
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm12, %zmm4
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm1, %zmm12, %zmm5
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, %zmm5
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm1, %zmm11, %zmm5
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, %zmm6
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm1, %zmm12, %zmm6
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, %zmm6
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm1, %zmm4, %zmm6
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, %zmm6
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm1, %zmm10, %zmm6
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm1, %zmm0, %zmm3
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 384(%r10), %zmm29
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 384(%r10), %zmm31
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 384(%rax), %zmm3
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm29, %zmm1
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm3, %zmm14, %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm31, %zmm1
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm3, %zmm12, %zmm1
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm29, %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm31, %zmm1
 ; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm3, %zmm4, %zmm1
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 384(%r8), %zmm25
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 384(%r9), %zmm12
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm25, %zmm31
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm12, %zmm11, %zmm31
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 448(%r10), %zmm28
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 448(%rax), %zmm9
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm28, %zmm1
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm9, %zmm14, %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 384(%r8), %zmm27
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 384(%r9), %zmm10
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm27, %zmm1
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm10, %zmm5, %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 448(%r10), %zmm30
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 448(%rax), %zmm11
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm30, %zmm1
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm11, %zmm12, %zmm1
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 448(%rdi), %zmm5
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 448(%rdi), %zmm6
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 448(%rsi), %zmm1
-; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm1, %zmm5, %zmm14
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm28, %zmm2
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm9, %zmm4, %zmm2
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm1, %zmm6, %zmm12
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm30, %zmm2
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm11, %zmm4, %zmm2
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm1, %zmm5, %zmm4
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm1, %zmm6, %zmm4
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 448(%r8), %zmm17
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 448(%r9), %zmm8
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm17, %zmm30
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm8, %zmm11, %zmm30
-; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm1, %zmm5, %zmm11
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm1, %zmm0, %zmm5
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 448(%r8), %zmm19
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 448(%r9), %zmm12
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm19, %zmm2
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm12, %zmm5, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm1, %zmm6, %zmm5
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm25, %zmm24
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm12, %zmm0, %zmm24
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm17, %zmm27
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm8, %zmm0, %zmm27
-; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm5 = [0,8,0,8,0,8,0,8]
-; AVX512DQBW-FAST-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm21, %zmm0
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm19, %zmm5, %zmm0
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm6 = [1,9,1,9,1,9,1,9]
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm1, %zmm0, %zmm6
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm27, %zmm1
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm10, %zmm0, %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, %zmm5
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm19, %zmm1
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm12, %zmm0, %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm6 = [0,8,0,8,0,8,0,8]
 ; AVX512DQBW-FAST-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm21, %zmm0
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm19, %zmm6, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm20, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm26, %zmm6, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm14 = [1,9,1,9,1,9,1,9]
+; AVX512DQBW-FAST-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm20, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm26, %zmm14, %zmm0
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10]
 ; AVX512DQBW-FAST-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm21, %zmm0
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm19, %zmm2, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm20, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm26, %zmm2, %zmm0
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [3,11,3,11,3,11,3,11]
 ; AVX512DQBW-FAST-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm19, %zmm1, %zmm21
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm26, %zmm1, %zmm20
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm7
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm8
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm4, %zmm5, %zmm7
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm7
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm4, %zmm6, %zmm7
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm21
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm4, %zmm2, %zmm21
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm4, %zmm6, %zmm8
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm8
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm4, %zmm14, %zmm8
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm29
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm4, %zmm2, %zmm29
 ; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm4, %zmm1, %zmm0
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm23, %zmm4
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm28, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm21, %zmm6, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm28, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm21, %zmm14, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm28, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm21, %zmm2, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm21, %zmm1, %zmm28
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm0, %zmm5, %zmm4
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm23, %zmm4
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm0, %zmm6, %zmm4
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm26, %zmm6, %zmm4
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm23, %zmm4
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm0, %zmm2, %zmm4
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm26, %zmm14, %zmm4
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm0, %zmm1, %zmm23
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm28
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm26, %zmm2, %zmm28
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm26, %zmm1, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm7
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm4, %zmm5, %zmm7
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm7
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm4, %zmm6, %zmm7
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm19
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm4, %zmm2, %zmm19
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm4, %zmm1, %zmm0
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm4
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm18, %zmm5, %zmm4
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm24, %zmm6, %zmm4
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm4
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm18, %zmm6, %zmm4
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm24, %zmm14, %zmm4
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm4
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm18, %zmm2, %zmm4
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm24, %zmm2, %zmm4
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm18, %zmm1, %zmm0
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm24, %zmm1, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm7, %zmm4
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm7
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm4, %zmm5, %zmm7
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm7
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm4, %zmm6, %zmm7
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm0, %zmm6, %zmm4
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm7, %zmm4
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm0, %zmm14, %zmm4
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm7, %zmm21
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm0, %zmm2, %zmm21
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm0, %zmm1, %zmm7
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm14
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm4, %zmm2, %zmm14
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm4, %zmm1, %zmm0
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm4
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm20, %zmm5, %zmm4
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm17, %zmm6, %zmm4
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm4
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm20, %zmm6, %zmm4
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm17, %zmm14, %zmm4
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm4
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm20, %zmm2, %zmm4
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm17, %zmm2, %zmm4
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm20, %zmm1, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm17, %zmm1, %zmm0
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm4
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm10, %zmm5, %zmm4
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm7, %zmm6, %zmm4
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm4
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm10, %zmm6, %zmm4
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm7, %zmm14, %zmm4
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm11
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm10, %zmm2, %zmm11
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm10, %zmm1, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm15
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm7, %zmm2, %zmm15
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm7, %zmm1, %zmm0
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm4
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm22, %zmm5, %zmm4
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm25, %zmm6, %zmm4
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm4
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm22, %zmm6, %zmm4
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm25, %zmm14, %zmm4
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm4
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm22, %zmm2, %zmm4
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm25, %zmm2, %zmm4
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm22, %zmm1, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm25, %zmm1, %zmm0
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm7
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm4, %zmm5, %zmm7
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm7
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm4, %zmm6, %zmm7
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm10
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm4, %zmm2, %zmm10
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm4, %zmm1, %zmm0
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm26, %zmm0
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm16, %zmm5, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm18, %zmm6, %zmm4
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm4
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm18, %zmm14, %zmm4
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm13
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm18, %zmm2, %zmm13
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm18, %zmm1, %zmm0
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm26, %zmm0
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm16, %zmm6, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm23, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm9, %zmm6, %zmm0
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm26, %zmm0
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm16, %zmm2, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm23, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm9, %zmm14, %zmm0
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm16, %zmm1, %zmm26
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm13, %zmm22
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm15, %zmm5, %zmm22
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm13, %zmm23
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm15, %zmm6, %zmm23
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm13, %zmm7
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm15, %zmm2, %zmm7
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm15, %zmm1, %zmm13
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm29, %zmm0
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm3, %zmm5, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm23, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm9, %zmm2, %zmm0
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm29, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm9, %zmm1, %zmm23
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm22, %zmm24
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm16, %zmm6, %zmm24
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm22, %zmm25
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm16, %zmm14, %zmm25
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm22, %zmm8
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm16, %zmm2, %zmm8
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm16, %zmm1, %zmm22
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm31, %zmm0
 ; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm3, %zmm6, %zmm0
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm31 {%k1} = zmm29[0],zmm3[0],zmm29[2],zmm3[2],zmm29[4],zmm3[4],zmm29[6],zmm3[6]
-; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm24 {%k1} = zmm29[1],zmm3[1],zmm29[3],zmm3[3],zmm29[5],zmm3[5],zmm29[7],zmm3[7]
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm29, %zmm20
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm3, %zmm2, %zmm20
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm3, %zmm1, %zmm29
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm25, %zmm15
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm12, %zmm5, %zmm15
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm25, %zmm16
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm12, %zmm6, %zmm16
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm25, %zmm18
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm12, %zmm2, %zmm18
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm25[0],zmm12[0],zmm25[2],zmm12[2],zmm25[4],zmm12[4],zmm25[6],zmm12[6]
-; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm4 = zmm25[1],zmm12[1],zmm25[3],zmm12[3],zmm25[5],zmm12[5],zmm25[7],zmm12[7]
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm12, %zmm1, %zmm25
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm28, %zmm0
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm9, %zmm5, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm31, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm3, %zmm14, %zmm0
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm28, %zmm0
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm9, %zmm6, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm31[0],zmm3[0],zmm31[2],zmm3[2],zmm31[4],zmm3[4],zmm31[6],zmm3[6]
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm31[1],zmm3[1],zmm31[3],zmm3[3],zmm31[5],zmm3[5],zmm31[7],zmm3[7]
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm31, %zmm26
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm3, %zmm2, %zmm26
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm3, %zmm1, %zmm31
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm27, %zmm17
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm10, %zmm6, %zmm17
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm27, %zmm20
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm10, %zmm14, %zmm20
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm27, %zmm18
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm10, %zmm2, %zmm18
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm27[0],zmm10[0],zmm27[2],zmm10[2],zmm27[4],zmm10[4],zmm27[6],zmm10[6]
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm4 = zmm27[1],zmm10[1],zmm27[3],zmm10[3],zmm27[5],zmm10[5],zmm27[7],zmm10[7]
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm10, %zmm1, %zmm27
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm30, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm11, %zmm6, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm30, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm11, %zmm14, %zmm0
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm30 {%k1} = zmm28[0],zmm9[0],zmm28[2],zmm9[2],zmm28[4],zmm9[4],zmm28[6],zmm9[6]
-; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm28[1],zmm9[1],zmm28[3],zmm9[3],zmm28[5],zmm9[5],zmm28[7],zmm9[7]
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm28, %zmm12
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm9, %zmm2, %zmm12
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm9, %zmm1, %zmm28
-; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm8, %zmm17, %zmm5
-; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm8, %zmm17, %zmm6
-; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm8, %zmm17, %zmm2
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm9 = zmm17[0],zmm8[0],zmm17[2],zmm8[2],zmm17[4],zmm8[4],zmm17[6],zmm8[6]
-; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm0 = zmm17[1],zmm8[1],zmm17[3],zmm8[3],zmm17[5],zmm8[5],zmm17[7],zmm8[7]
-; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm8, %zmm1, %zmm17
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm10 {%k1} = zmm30[0],zmm11[0],zmm30[2],zmm11[2],zmm30[4],zmm11[4],zmm30[6],zmm11[6]
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm30[1],zmm11[1],zmm30[3],zmm11[3],zmm30[5],zmm11[5],zmm30[7],zmm11[7]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm30, %zmm16
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm11, %zmm2, %zmm16
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm11, %zmm1, %zmm30
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm12, %zmm19, %zmm6
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm12, %zmm19, %zmm14
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm12, %zmm19, %zmm2
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} zmm11 = zmm19[0],zmm12[0],zmm19[2],zmm12[2],zmm19[4],zmm12[4],zmm19[6],zmm12[6]
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} zmm0 = zmm19[1],zmm12[1],zmm19[3],zmm12[3],zmm19[5],zmm12[5],zmm19[7],zmm12[7]
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm12, %zmm1, %zmm19
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
 ; AVX512DQBW-FAST-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
 ; AVX512DQBW-FAST-NEXT:    # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm8, %zmm3 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm9, %zmm3 {%k1}
 ; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm3, %zmm1
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
@@ -20983,19 +19450,20 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
 ; AVX512DQBW-FAST-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
 ; AVX512DQBW-FAST-NEXT:    # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm31, %zmm1
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm7, %zmm1
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
 ; AVX512DQBW-FAST-NEXT:    # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm24, %zmm1
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vinsertf64x4 $0, %ymm1, %zmm3, %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
 ; AVX512DQBW-FAST-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
 ; AVX512DQBW-FAST-NEXT:    # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, %zmm9 {%k1}
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm9, %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, %zmm11 {%k1}
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm11, %zmm1
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
 ; AVX512DQBW-FAST-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
@@ -21007,338 +19475,308 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQBW-FAST-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
 ; AVX512DQBW-FAST-NEXT:    # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm30, %zmm0
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm10, %zmm0
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQBW-FAST-NEXT:    vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
 ; AVX512DQBW-FAST-NEXT:    # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm27, %zmm0
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm5, %zmm0
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm21 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm29 {%k1}
 ; AVX512DQBW-FAST-NEXT:    vmovdqa (%rcx), %ymm0
 ; AVX512DQBW-FAST-NEXT:    vmovdqa (%rdx), %ymm1
 ; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
 ; AVX512DQBW-FAST-NEXT:    vmovdqa (%rsi), %ymm4
-; AVX512DQBW-FAST-NEXT:    vmovdqa (%rdi), %ymm8
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm9 = ymm8[0],ymm4[0],ymm8[2],ymm4[2]
-; AVX512DQBW-FAST-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm9[2,3],ymm3[2,3]
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm21, %zmm3
+; AVX512DQBW-FAST-NEXT:    vmovdqa (%rdi), %ymm9
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm11 = ymm9[0],ymm4[0],ymm9[2],ymm4[2]
+; AVX512DQBW-FAST-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm11[2,3],ymm3[2,3]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm29, %zmm3
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, %zmm9 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, %zmm5 {%k1}
 ; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
-; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm4[1],ymm8[3],ymm4[3]
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm4[1],ymm9[3],ymm4[3]
 ; AVX512DQBW-FAST-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm9, %zmm0
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm5, %zmm0
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm19 {%k1}
-; AVX512DQBW-FAST-NEXT:    vmovdqa 64(%rsi), %ymm0
-; AVX512DQBW-FAST-NEXT:    vmovdqa 64(%rdi), %ymm1
-; AVX512DQBW-FAST-NEXT:    vmovdqa 64(%rcx), %ymm3
-; AVX512DQBW-FAST-NEXT:    vmovdqa 64(%rdx), %ymm4
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm8 = ymm4[0],ymm3[0],ymm4[2],ymm3[2]
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm9 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
-; AVX512DQBW-FAST-NEXT:    vperm2i128 {{.*#+}} ymm8 = ymm9[2,3],ymm8[2,3]
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm8, %zmm19, %zmm8
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 (%rsp), %zmm9 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm8, %zmm9 {%k1}
-; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm4[1],ymm3[1],ymm4[3],ymm3[3]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm28 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqa 64(%rcx), %ymm0
+; AVX512DQBW-FAST-NEXT:    vmovdqa 64(%rdx), %ymm1
+; AVX512DQBW-FAST-NEXT:    vmovdqa 64(%rsi), %ymm4
+; AVX512DQBW-FAST-NEXT:    vmovdqa 64(%rdi), %ymm9
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm11 = ymm9[0],ymm4[0],ymm9[2],ymm4[2]
+; AVX512DQBW-FAST-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm11[2,3],ymm3[2,3]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm28, %zmm3
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, %zmm5 {%k1}
 ; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
-; AVX512DQBW-FAST-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3]
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm9, %zmm0
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm4[1],ymm9[3],ymm4[3]
+; AVX512DQBW-FAST-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm5, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm14 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm21 {%k1}
 ; AVX512DQBW-FAST-NEXT:    vmovdqa 128(%rcx), %ymm0
 ; AVX512DQBW-FAST-NEXT:    vmovdqa 128(%rdx), %ymm1
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
-; AVX512DQBW-FAST-NEXT:    vmovdqa 128(%rsi), %ymm8
-; AVX512DQBW-FAST-NEXT:    vmovdqa 128(%rdi), %ymm9
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm4 = ymm9[0],ymm8[0],ymm9[2],ymm8[2]
-; AVX512DQBW-FAST-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3]
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm14, %zmm3
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm4 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX512DQBW-FAST-NEXT:    vmovdqa 128(%rsi), %ymm9
+; AVX512DQBW-FAST-NEXT:    vmovdqa 128(%rdi), %ymm11
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm11[0],ymm9[0],ymm11[2],ymm9[2]
+; AVX512DQBW-FAST-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm4[2,3]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm21, %zmm3
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 (%rsp), %zmm3 # 64-byte Reload
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
 ; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
-; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm8[1],ymm9[3],ymm8[3]
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm11[1],ymm9[1],ymm11[3],ymm9[3]
 ; AVX512DQBW-FAST-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
 ; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm4, %zmm0
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm11 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm15 {%k1}
 ; AVX512DQBW-FAST-NEXT:    vmovdqa 192(%rcx), %ymm0
 ; AVX512DQBW-FAST-NEXT:    vmovdqa 192(%rdx), %ymm1
 ; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
 ; AVX512DQBW-FAST-NEXT:    vmovdqa 192(%rsi), %ymm4
-; AVX512DQBW-FAST-NEXT:    vmovdqa 192(%rdi), %ymm8
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm14 = ymm8[0],ymm4[0],ymm8[2],ymm4[2]
-; AVX512DQBW-FAST-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm14[2,3],ymm3[2,3]
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm11, %zmm24
+; AVX512DQBW-FAST-NEXT:    vmovdqa 192(%rdi), %ymm9
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm11 = ymm9[0],ymm4[0],ymm9[2],ymm4[2]
+; AVX512DQBW-FAST-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm11[2,3],ymm3[2,3]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm15, %zmm29
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, %zmm9 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, %zmm5 {%k1}
 ; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
-; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm4[1],ymm8[3],ymm4[3]
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm4[1],ymm9[3],ymm4[3]
 ; AVX512DQBW-FAST-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm9, %zmm21
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm5, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm10 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm13 {%k1}
 ; AVX512DQBW-FAST-NEXT:    vmovdqa 256(%rcx), %ymm0
 ; AVX512DQBW-FAST-NEXT:    vmovdqa 256(%rdx), %ymm1
 ; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
 ; AVX512DQBW-FAST-NEXT:    vmovdqa 256(%rsi), %ymm4
-; AVX512DQBW-FAST-NEXT:    vmovdqa 256(%rdi), %ymm8
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm14 = ymm8[0],ymm4[0],ymm8[2],ymm4[2]
-; AVX512DQBW-FAST-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm14[2,3],ymm3[2,3]
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm10, %zmm19
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, %zmm9 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqa 256(%rdi), %ymm11
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm15 = ymm11[0],ymm4[0],ymm11[2],ymm4[2]
+; AVX512DQBW-FAST-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm15[2,3],ymm3[2,3]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm13, %zmm3
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm5, %zmm7 {%k1}
 ; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
-; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm4[1],ymm8[3],ymm4[3]
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm11[1],ymm4[1],ymm11[3],ymm4[3]
 ; AVX512DQBW-FAST-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm9, %zmm10
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm7 {%k1}
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm7, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, %zmm8 {%k1}
 ; AVX512DQBW-FAST-NEXT:    vmovdqa 320(%rcx), %ymm1
-; AVX512DQBW-FAST-NEXT:    vmovdqa 320(%rdx), %ymm3
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm3[0],ymm1[0],ymm3[2],ymm1[2]
-; AVX512DQBW-FAST-NEXT:    vmovdqa 320(%rsi), %ymm4
-; AVX512DQBW-FAST-NEXT:    vmovdqa 320(%rdi), %ymm14
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm9 = ymm14[0],ymm4[0],ymm14[2],ymm4[2]
-; AVX512DQBW-FAST-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm9[2,3],ymm0[2,3]
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm7, %zmm8
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm26, %zmm13 {%k1}
-; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3]
-; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm14[1],ymm4[1],ymm14[3],ymm4[3]
-; AVX512DQBW-FAST-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3]
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm13, %zmm13
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm20, %zmm18 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqa 320(%rdx), %ymm4
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm11 = ymm4[0],ymm1[0],ymm4[2],ymm1[2]
+; AVX512DQBW-FAST-NEXT:    vmovdqa 320(%rsi), %ymm12
+; AVX512DQBW-FAST-NEXT:    vmovdqa 320(%rdi), %ymm15
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm9 = ymm15[0],ymm12[0],ymm15[2],ymm12[2]
+; AVX512DQBW-FAST-NEXT:    vperm2i128 {{.*#+}} ymm9 = ymm9[2,3],ymm11[2,3]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm9, %zmm8, %zmm8
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm23, %zmm22 {%k1}
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm4[1],ymm1[1],ymm4[3],ymm1[3]
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm4 = ymm15[1],ymm12[1],ymm15[3],ymm12[3]
+; AVX512DQBW-FAST-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[2,3]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm22, %zmm4
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm26, %zmm18 {%k1}
 ; AVX512DQBW-FAST-NEXT:    vmovdqa 384(%rcx), %ymm1
-; AVX512DQBW-FAST-NEXT:    vmovdqa 384(%rdx), %ymm3
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm4 = ymm3[0],ymm1[0],ymm3[2],ymm1[2]
-; AVX512DQBW-FAST-NEXT:    vmovdqa 384(%rsi), %ymm7
-; AVX512DQBW-FAST-NEXT:    vmovdqa 384(%rdi), %ymm9
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm14 = ymm9[0],ymm7[0],ymm9[2],ymm7[2]
-; AVX512DQBW-FAST-NEXT:    vperm2i128 {{.*#+}} ymm4 = ymm14[2,3],ymm4[2,3]
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm18, %zmm18
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm29, %zmm25 {%k1}
-; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3]
-; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm9[1],ymm7[1],ymm9[3],ymm7[3]
-; AVX512DQBW-FAST-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3]
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm25, %zmm25
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm12, %zmm2 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqa 384(%rdx), %ymm9
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm10 = ymm9[0],ymm1[0],ymm9[2],ymm1[2]
+; AVX512DQBW-FAST-NEXT:    vmovdqa 384(%rsi), %ymm11
+; AVX512DQBW-FAST-NEXT:    vmovdqa 384(%rdi), %ymm12
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm11[0],ymm12[2],ymm11[2]
+; AVX512DQBW-FAST-NEXT:    vperm2i128 {{.*#+}} ymm10 = ymm13[2,3],ymm10[2,3]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm10, %zmm18, %zmm10
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm31, %zmm27 {%k1}
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm1[1],ymm9[3],ymm1[3]
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm9 = ymm12[1],ymm11[1],ymm12[3],ymm11[3]
+; AVX512DQBW-FAST-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm9[2,3],ymm1[2,3]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm27, %zmm9
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm16, %zmm2 {%k1}
 ; AVX512DQBW-FAST-NEXT:    vmovdqa 448(%rcx), %ymm1
-; AVX512DQBW-FAST-NEXT:    vmovdqa 448(%rdx), %ymm3
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm4 = ymm3[0],ymm1[0],ymm3[2],ymm1[2]
-; AVX512DQBW-FAST-NEXT:    vmovdqa 448(%rsi), %ymm7
-; AVX512DQBW-FAST-NEXT:    vmovdqa 448(%rdi), %ymm9
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm12 = ymm9[0],ymm7[0],ymm9[2],ymm7[2]
-; AVX512DQBW-FAST-NEXT:    vperm2i128 {{.*#+}} ymm4 = ymm12[2,3],ymm4[2,3]
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm4, %zmm2, %zmm4
-; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3]
-; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm9[1],ymm7[1],ymm9[3],ymm7[3]
+; AVX512DQBW-FAST-NEXT:    vmovdqa 448(%rdx), %ymm7
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm11 = ymm7[0],ymm1[0],ymm7[2],ymm1[2]
+; AVX512DQBW-FAST-NEXT:    vmovdqa 448(%rsi), %ymm12
+; AVX512DQBW-FAST-NEXT:    vmovdqa 448(%rdi), %ymm13
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm15 = ymm13[0],ymm12[0],ymm13[2],ymm12[2]
+; AVX512DQBW-FAST-NEXT:    vperm2i128 {{.*#+}} ymm11 = ymm15[2,3],ymm11[2,3]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm11, %zmm2, %zmm11
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm1[1],ymm7[3],ymm1[3]
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm13[1],ymm12[1],ymm13[3],ymm12[3]
 ; AVX512DQBW-FAST-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm28, %zmm17 {%k1}
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm17, %zmm9
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm11 {%k1}
-; AVX512DQBW-FAST-NEXT:    vmovdqa (%rcx), %xmm2
-; AVX512DQBW-FAST-NEXT:    vmovdqa (%rdx), %xmm3
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm2[0]
-; AVX512DQBW-FAST-NEXT:    vmovdqa (%rsi), %xmm7
-; AVX512DQBW-FAST-NEXT:    vmovdqa (%rdi), %xmm12
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm14 = xmm12[0],xmm7[0]
-; AVX512DQBW-FAST-NEXT:    vinserti128 $1, %xmm1, %ymm14, %ymm1
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm11, %zmm11
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm3[1],xmm2[1]
-; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm3 = xmm12[1],xmm7[1]
-; AVX512DQBW-FAST-NEXT:    vinserti128 $1, %xmm2, %ymm3, %ymm2
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm1, %zmm7
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512DQBW-FAST-NEXT:    vmovdqa 64(%rcx), %xmm2
-; AVX512DQBW-FAST-NEXT:    vmovdqa 64(%rdx), %xmm12
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm12[0],xmm2[0]
-; AVX512DQBW-FAST-NEXT:    vmovdqa 64(%rsi), %xmm14
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%rdi), %xmm17
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm20 = xmm17[0],xmm14[0]
-; AVX512DQBW-FAST-NEXT:    vinserti32x4 $1, %xmm3, %ymm20, %ymm3
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm1, %zmm3
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm12[1],xmm2[1]
-; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm12 = xmm17[1],xmm14[1]
-; AVX512DQBW-FAST-NEXT:    vinserti128 $1, %xmm2, %ymm12, %ymm2
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm1, %zmm2
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512DQBW-FAST-NEXT:    vmovdqa 128(%rcx), %xmm12
-; AVX512DQBW-FAST-NEXT:    vmovdqa 128(%rdx), %xmm14
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm17 = xmm14[0],xmm12[0]
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 128(%rsi), %xmm20
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 128(%rdi), %xmm26
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm28 = xmm26[0],xmm20[0]
-; AVX512DQBW-FAST-NEXT:    vinserti32x4 $1, %xmm17, %ymm28, %ymm17
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm17, %zmm1, %zmm17
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm12 = xmm14[1],xmm12[1]
-; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm14 = xmm26[1],xmm20[1]
-; AVX512DQBW-FAST-NEXT:    vinserti128 $1, %xmm12, %ymm14, %ymm12
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm12, %zmm1, %zmm12
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512DQBW-FAST-NEXT:    vmovdqa 192(%rcx), %xmm14
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 192(%rdx), %xmm20
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm26 = xmm20[0],xmm14[0]
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 192(%rsi), %xmm28
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 192(%rdi), %xmm29
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm30 = xmm29[0],xmm28[0]
-; AVX512DQBW-FAST-NEXT:    vinserti32x4 $1, %xmm26, %ymm30, %ymm26
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm26, %zmm1, %zmm26
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm14 = xmm20[1],xmm14[1]
-; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm20 = xmm29[1],xmm28[1]
-; AVX512DQBW-FAST-NEXT:    vinserti32x4 $1, %xmm14, %ymm20, %ymm14
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm14, %zmm1, %zmm14
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm30, %zmm19 {%k1}
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm19, %zmm7
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 256(%rcx), %xmm20
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 256(%rdx), %xmm28
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm29 = xmm28[0],xmm20[0]
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 256(%rsi), %xmm30
-; AVX512DQBW-FAST-NEXT:    vmovdqa 256(%rdi), %xmm0
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm31 = xmm0[0],xmm30[0]
-; AVX512DQBW-FAST-NEXT:    vinserti32x4 $1, %xmm29, %ymm31, %ymm29
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm29, %zmm1, %zmm29
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, %zmm27 {%k1}
-; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm20 = xmm28[1],xmm20[1]
-; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm30[1]
-; AVX512DQBW-FAST-NEXT:    vinserti32x4 $1, %xmm20, %ymm0, %ymm0
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm27, %zmm1
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm22 {%k1}
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 320(%rcx), %xmm20
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 320(%rdx), %xmm28
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm30 = xmm28[0],xmm20[0]
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 320(%rsi), %xmm31
-; AVX512DQBW-FAST-NEXT:    vmovdqa 320(%rdi), %xmm0
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm27 = xmm0[0],xmm31[0]
-; AVX512DQBW-FAST-NEXT:    vinserti32x4 $1, %xmm30, %ymm27, %ymm27
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm27, %zmm22, %zmm22
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, %zmm13 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqa (%rsi), %xmm1
+; AVX512DQBW-FAST-NEXT:    vinserti128 $1, (%rcx), %ymm1, %ymm2
+; AVX512DQBW-FAST-NEXT:    vmovdqa (%rdi), %xmm1
+; AVX512DQBW-FAST-NEXT:    vinserti128 $1, (%rdx), %ymm1, %ymm12
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm12[0],ymm2[0],ymm12[2],ymm2[2]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm1, %zmm13, %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm13, %zmm15 {%k1}
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm12[1],ymm2[1],ymm12[3],ymm2[3]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm15, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm12, %zmm18 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqa 64(%rsi), %xmm12
+; AVX512DQBW-FAST-NEXT:    vinserti128 $1, 64(%rcx), %ymm12, %ymm12
+; AVX512DQBW-FAST-NEXT:    vmovdqa 64(%rdi), %xmm13
+; AVX512DQBW-FAST-NEXT:    vinserti128 $1, 64(%rdx), %ymm13, %ymm13
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm15 = ymm13[0],ymm12[0],ymm13[2],ymm12[2]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm15, %zmm18, %zmm15
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm18, %zmm19 {%k1}
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm12 = ymm13[1],ymm12[1],ymm13[3],ymm12[3]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm12, %zmm19, %zmm12
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm13, %zmm23 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqa 128(%rsi), %xmm13
+; AVX512DQBW-FAST-NEXT:    vinserti128 $1, 128(%rcx), %ymm13, %ymm13
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 128(%rdi), %xmm18
+; AVX512DQBW-FAST-NEXT:    vinserti32x4 $1, 128(%rdx), %ymm18, %ymm18
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm19 = ymm18[0],ymm13[0],ymm18[2],ymm13[2]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm19, %zmm23, %zmm19
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
 ; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm27, %zmm23 {%k1}
-; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm20 = xmm28[1],xmm20[1]
-; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm31[1]
-; AVX512DQBW-FAST-NEXT:    vinserti32x4 $1, %xmm20, %ymm0, %ymm0
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm23, %zmm0
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm20, %zmm15 {%k1}
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 384(%rcx), %xmm20
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 384(%rdx), %xmm23
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm27 = xmm23[0],xmm20[0]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm23, %zmm27 {%k1}
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm13 = ymm18[1],ymm13[1],ymm18[3],ymm13[3]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm13, %zmm27, %zmm13
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm18, %zmm5 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 192(%rsi), %xmm18
+; AVX512DQBW-FAST-NEXT:    vinserti32x4 $1, 192(%rcx), %ymm18, %ymm18
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 192(%rdi), %xmm23
+; AVX512DQBW-FAST-NEXT:    vinserti32x4 $1, 192(%rdx), %ymm23, %ymm23
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm27 = ymm23[0],ymm18[0],ymm23[2],ymm18[2]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm27, %zmm5, %zmm27
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm30, %zmm31 {%k1}
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm18 = ymm23[1],ymm18[1],ymm23[3],ymm18[3]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm18, %zmm31, %zmm18
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm5, %zmm16 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 256(%rsi), %xmm23
+; AVX512DQBW-FAST-NEXT:    vinserti32x4 $1, 256(%rcx), %ymm23, %ymm23
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 256(%rdi), %xmm30
+; AVX512DQBW-FAST-NEXT:    vinserti32x4 $1, 256(%rdx), %ymm30, %ymm30
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm31 = ymm30[0],ymm23[0],ymm30[2],ymm23[2]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm31, %zmm16, %zmm26
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm5, %zmm16 {%k1}
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm23 = ymm30[1],ymm23[1],ymm30[3],ymm23[3]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm23, %zmm16, %zmm23
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm5, %zmm24 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 320(%rsi), %xmm28
+; AVX512DQBW-FAST-NEXT:    vinserti32x4 $1, 320(%rcx), %ymm28, %ymm28
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 320(%rdi), %xmm30
+; AVX512DQBW-FAST-NEXT:    vinserti32x4 $1, 320(%rdx), %ymm30, %ymm30
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm31 = ymm30[0],ymm28[0],ymm30[2],ymm28[2]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm31, %zmm24, %zmm21
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm5, %zmm25 {%k1}
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm28 = ymm30[1],ymm28[1],ymm30[3],ymm28[3]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm28, %zmm25, %zmm22
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm5, %zmm17 {%k1}
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 384(%rsi), %xmm28
+; AVX512DQBW-FAST-NEXT:    vinserti32x4 $1, 384(%rcx), %ymm28, %ymm28
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 384(%rdi), %xmm30
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm31 = xmm30[0],xmm28[0]
-; AVX512DQBW-FAST-NEXT:    vinserti32x4 $1, %xmm27, %ymm31, %ymm27
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm27, %zmm15, %zmm15
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm27, %zmm16 {%k1}
-; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm20 = xmm23[1],xmm20[1]
-; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm23 = xmm30[1],xmm28[1]
-; AVX512DQBW-FAST-NEXT:    vinserti32x4 $1, %xmm20, %ymm23, %ymm20
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm20, %zmm16, %zmm16
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm20, %zmm5 {%k1}
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 448(%rcx), %xmm20
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 448(%rdx), %xmm23
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm27 = xmm23[0],xmm20[0]
+; AVX512DQBW-FAST-NEXT:    vinserti32x4 $1, 384(%rdx), %ymm30, %ymm30
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm31 = ymm30[0],ymm28[0],ymm30[2],ymm28[2]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm31, %zmm17, %zmm17
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm5, %zmm20 {%k1}
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm28 = ymm30[1],ymm28[1],ymm30[3],ymm28[3]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm28, %zmm20, %zmm16
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm5, %zmm6 {%k1}
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 448(%rsi), %xmm28
+; AVX512DQBW-FAST-NEXT:    vinserti32x4 $1, 448(%rcx), %ymm28, %ymm28
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 448(%rdi), %xmm30
-; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm31 = xmm30[0],xmm28[0]
-; AVX512DQBW-FAST-NEXT:    vinserti32x4 $1, %xmm27, %ymm31, %ymm27
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm27, %zmm5, %zmm5
-; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm27, %zmm6 {%k1}
-; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm20 = xmm23[1],xmm20[1]
-; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} xmm23 = xmm30[1],xmm28[1]
-; AVX512DQBW-FAST-NEXT:    vinserti32x4 $1, %xmm20, %ymm23, %ymm20
-; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm20, %zmm6, %zmm6
+; AVX512DQBW-FAST-NEXT:    vinserti32x4 $1, 448(%rdx), %ymm30, %ymm30
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} ymm31 = ymm30[0],ymm28[0],ymm30[2],ymm28[2]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm31, %zmm6, %zmm6
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm5, %zmm14 {%k1}
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm28 = ymm30[1],ymm28[1],ymm30[3],ymm28[3]
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm28, %zmm14, %zmm5
 ; AVX512DQBW-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm9, 3776(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm4, 3712(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm25, 3264(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm18, 3200(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm13, 2752(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm7, 3776(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm11, 3712(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm9, 3264(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm10, 3200(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm4, 2752(%rax)
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm8, 2688(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm10, 2240(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm19, 2176(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm21, 1728(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm24, 1664(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovaps %zmm4, 1216(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovaps %zmm4, 1152(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovups (%rsp), %zmm4 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovaps %zmm4, 704(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovaps %zmm4, 640(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovaps %zmm4, 192(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovaps %zmm4, 128(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovaps %zmm4, 4032(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovaps %zmm4, 3968(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovaps %zmm4, 3904(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovaps %zmm4, 3840(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm6, 3648(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm5, 3584(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovaps %zmm4, 3520(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovaps %zmm4, 3456(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovaps %zmm4, 3392(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovaps %zmm4, 3328(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, 2240(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, 2176(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovups (%rsp), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 1728(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm29, 1664(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 1216(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 1152(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 704(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 640(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 192(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 128(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 4032(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 3968(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 3904(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 3840(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm5, 3648(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm6, 3584(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 3520(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 3456(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 3392(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 3328(%rax)
 ; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm16, 3136(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm15, 3072(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovaps %zmm4, 3008(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovaps %zmm4, 2944(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovaps %zmm4, 2880(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT:    vmovaps %zmm4, 2816(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, 2624(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm22, 2560(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm17, 3072(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 3008(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 2944(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 2880(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 2816(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm22, 2624(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm21, 2560(%rax)
 ; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 2496(%rax)
 ; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
@@ -21347,8 +19785,8 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 2368(%rax)
 ; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 2304(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, 2112(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm29, 2048(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm23, 2112(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm26, 2048(%rax)
 ; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 1984(%rax)
 ; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
@@ -21357,8 +19795,8 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 1856(%rax)
 ; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 1792(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm14, 1600(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm26, 1536(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm18, 1600(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm27, 1536(%rax)
 ; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 1472(%rax)
 ; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
@@ -21367,8 +19805,8 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 1344(%rax)
 ; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 1280(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm12, 1088(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm17, 1024(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm13, 1088(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm19, 1024(%rax)
 ; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 960(%rax)
 ; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
@@ -21377,8 +19815,8 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 832(%rax)
 ; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 768(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, 576(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, 512(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm12, 576(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm15, 512(%rax)
 ; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 448(%rax)
 ; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
@@ -21387,9 +19825,9 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 320(%rax)
 ; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 256(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm7, 64(%rax)
-; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm11, (%rax)
-; AVX512DQBW-FAST-NEXT:    addq $5384, %rsp # imm = 0x1508
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, 64(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, (%rax)
+; AVX512DQBW-FAST-NEXT:    addq $5576, %rsp # imm = 0x15C8
 ; AVX512DQBW-FAST-NEXT:    vzeroupper
 ; AVX512DQBW-FAST-NEXT:    retq
   %in.vec0 = load <64 x i64>, ptr %in.vecptr0, align 64

diff  --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll
index cbd0e201c20603..22f7707ca2b470 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll
@@ -198,14 +198,14 @@ define void @store_i8_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
 ; AVX2-ONLY-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX2-ONLY-NEXT:    movq {{[0-9]+}}(%rsp), %r10
 ; AVX2-ONLY-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX2-ONLY-NEXT:    vmovdqa (%rdx), %xmm1
-; AVX2-ONLY-NEXT:    vmovdqa (%r8), %xmm2
-; AVX2-ONLY-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; AVX2-ONLY-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
-; AVX2-ONLY-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX2-ONLY-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1]
-; AVX2-ONLY-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0]
-; AVX2-ONLY-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-ONLY-NEXT:    vmovdqa (%rsi), %xmm1
+; AVX2-ONLY-NEXT:    vmovdqa (%rdx), %xmm2
+; AVX2-ONLY-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
+; AVX2-ONLY-NEXT:    vinserti128 $1, (%r9), %ymm1, %ymm1
+; AVX2-ONLY-NEXT:    vinserti128 $1, (%r8), %ymm0, %ymm0
+; AVX2-ONLY-NEXT:    vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
+; AVX2-ONLY-NEXT:    vinserti128 $1, (%r10), %ymm2, %ymm1
+; AVX2-ONLY-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
 ; AVX2-ONLY-NEXT:    vpshufb {{.*#+}} ymm1 = ymm0[0,4,8,12],zero,zero,zero,ymm0[1,5,9,13],zero,zero,zero,ymm0[2,6],zero,zero,ymm0[18,22,26],zero,zero,zero,zero,ymm0[19,23,27],zero,zero,zero,zero
 ; AVX2-ONLY-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
 ; AVX2-ONLY-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0,4,8],zero,zero,zero,zero,ymm0[1,5,9],zero,zero,ymm0[26,30],zero,zero,zero,ymm0[19,23,27,31],zero,zero,zero,zero,zero,zero,zero
@@ -222,14 +222,14 @@ define void @store_i8_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
 ; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %r10
 ; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512F-NEXT:    vmovdqa (%rdx), %xmm1
-; AVX512F-NEXT:    vmovdqa (%r8), %xmm2
-; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1]
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0]
-; AVX512F-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT:    vmovdqa (%rsi), %xmm1
+; AVX512F-NEXT:    vmovdqa (%rdx), %xmm2
+; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
+; AVX512F-NEXT:    vinserti128 $1, (%r9), %ymm1, %ymm1
+; AVX512F-NEXT:    vinserti128 $1, (%r8), %ymm0, %ymm0
+; AVX512F-NEXT:    vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
+; AVX512F-NEXT:    vinserti128 $1, (%r10), %ymm2, %ymm1
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
 ; AVX512F-NEXT:    vpshufb {{.*#+}} ymm1 = ymm0[0,4,8,12],zero,zero,zero,ymm0[1,5,9,13],zero,zero,zero,ymm0[2,6],zero,zero,ymm0[18,22,26],zero,zero,zero,zero,ymm0[19,23,27,u,u,u,u]
 ; AVX512F-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
 ; AVX512F-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0,4,8],zero,zero,zero,zero,ymm0[1,5,9],zero,zero,ymm0[26,30],zero,zero,zero,ymm0[19,23,27,31],zero,zero,zero,zero,zero,zero,zero
@@ -246,14 +246,14 @@ define void @store_i8_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
 ; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %r10
 ; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512BW-NEXT:    vmovdqa (%rdx), %xmm1
-; AVX512BW-NEXT:    vmovdqa (%r8), %xmm2
-; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1]
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0]
-; AVX512BW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512BW-NEXT:    vmovdqa (%rsi), %xmm1
+; AVX512BW-NEXT:    vmovdqa (%rdx), %xmm2
+; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
+; AVX512BW-NEXT:    vinserti128 $1, (%r9), %ymm1, %ymm1
+; AVX512BW-NEXT:    vinserti128 $1, (%r8), %ymm0, %ymm0
+; AVX512BW-NEXT:    vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
+; AVX512BW-NEXT:    vinserti128 $1, (%r10), %ymm2, %ymm1
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
 ; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm1 = ymm0[0,4,8,12],zero,zero,zero,ymm0[1,5,9,13],zero,zero,zero,ymm0[2,6],zero,zero,ymm0[18,22,26],zero,zero,zero,zero,ymm0[19,23,27],zero,zero,zero,zero
 ; AVX512BW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
 ; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0,4,8],zero,zero,zero,zero,ymm0[1,5,9],zero,zero,ymm0[26,30],zero,zero,zero,ymm0[19,23,27,31],zero,zero,zero,zero,zero,zero,zero

diff  --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll
index 69fb5834962d54..cfb971dd162170 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll
@@ -208,16 +208,16 @@ define void @store_i8_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
 ; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r10
 ; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r11
 ; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX2-NEXT:    vmovdqa (%rdx), %xmm1
-; AVX2-NEXT:    vmovdqa (%r8), %xmm2
-; AVX2-NEXT:    vmovdqa (%r11), %xmm3
-; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1]
-; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm3[0],mem[0],xmm3[1],mem[1]
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT:    vmovdqa (%rsi), %xmm1
+; AVX2-NEXT:    vmovdqa (%rdx), %xmm2
+; AVX2-NEXT:    vmovdqa (%rcx), %xmm3
+; AVX2-NEXT:    vinserti128 $1, (%r11), %ymm3, %ymm3
+; AVX2-NEXT:    vinserti128 $1, (%r10), %ymm2, %ymm2
+; AVX2-NEXT:    vpunpckldq {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5]
+; AVX2-NEXT:    vinserti128 $1, (%r9), %ymm1, %ymm1
+; AVX2-NEXT:    vinserti128 $1, (%r8), %ymm0, %ymm0
+; AVX2-NEXT:    vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
+; AVX2-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2]
 ; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15,16,20,24,28,17,21,25,29,18,22,26,30,19,23,27,31]
 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,4,1,5,2,6,3,7]
 ; AVX2-NEXT:    vpermd %ymm0, %ymm1, %ymm0

diff  --git a/llvm/test/CodeGen/X86/vector-reduce-ctpop.ll b/llvm/test/CodeGen/X86/vector-reduce-ctpop.ll
index c2be962ac3fb60..401118af26259f 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-ctpop.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-ctpop.ll
@@ -643,21 +643,15 @@ define <4 x i64> @reduce_ctpop_v4i64_buildvector_v4i64(<4 x i64> %a0, <4 x i64>
 ; AVX2-NEXT:    vpshufb %ymm3, %ymm6, %ymm3
 ; AVX2-NEXT:    vpaddb %ymm7, %ymm3, %ymm3
 ; AVX2-NEXT:    vpsadbw %ymm5, %ymm3, %ymm3
-; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm4
-; AVX2-NEXT:    vpaddq %xmm4, %xmm2, %xmm2
-; AVX2-NEXT:    vextracti128 $1, %ymm3, %xmm4
-; AVX2-NEXT:    vpaddq %xmm4, %xmm3, %xmm3
-; AVX2-NEXT:    vpunpckhqdq {{.*#+}} xmm4 = xmm2[1],xmm3[1]
-; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm5
-; AVX2-NEXT:    vpaddq %xmm5, %xmm1, %xmm1
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm5
-; AVX2-NEXT:    vpaddq %xmm5, %xmm0, %xmm0
-; AVX2-NEXT:    vpunpckhqdq {{.*#+}} xmm5 = xmm0[1],xmm1[1]
-; AVX2-NEXT:    vinserti128 $1, %xmm4, %ymm5, %ymm4
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm4 = ymm1[2,3],ymm3[2,3]
+; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm1, %ymm1
+; AVX2-NEXT:    vpaddq %ymm4, %ymm1, %ymm1
+; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm0[2,3],ymm2[2,3]
 ; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
-; AVX2-NEXT:    vpaddq %ymm4, %ymm0, %ymm0
+; AVX2-NEXT:    vpaddq %ymm3, %ymm0, %ymm0
+; AVX2-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; AVX2-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; AVX2-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512VL-LABEL: reduce_ctpop_v4i64_buildvector_v4i64:
@@ -694,21 +688,15 @@ define <4 x i64> @reduce_ctpop_v4i64_buildvector_v4i64(<4 x i64> %a0, <4 x i64>
 ; AVX512VL-NEXT:    vpshufb %ymm3, %ymm6, %ymm3
 ; AVX512VL-NEXT:    vpaddb %ymm7, %ymm3, %ymm3
 ; AVX512VL-NEXT:    vpsadbw %ymm5, %ymm3, %ymm3
-; AVX512VL-NEXT:    vextracti128 $1, %ymm2, %xmm4
-; AVX512VL-NEXT:    vpaddq %xmm4, %xmm2, %xmm2
-; AVX512VL-NEXT:    vextracti128 $1, %ymm3, %xmm4
-; AVX512VL-NEXT:    vpaddq %xmm4, %xmm3, %xmm3
-; AVX512VL-NEXT:    vpunpckhqdq {{.*#+}} xmm4 = xmm2[1],xmm3[1]
-; AVX512VL-NEXT:    vextracti128 $1, %ymm1, %xmm5
-; AVX512VL-NEXT:    vpaddq %xmm5, %xmm1, %xmm1
-; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm5
-; AVX512VL-NEXT:    vpaddq %xmm5, %xmm0, %xmm0
-; AVX512VL-NEXT:    vpunpckhqdq {{.*#+}} xmm5 = xmm0[1],xmm1[1]
-; AVX512VL-NEXT:    vinserti128 $1, %xmm4, %ymm5, %ymm4
-; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512VL-NEXT:    vperm2i128 {{.*#+}} ymm4 = ymm1[2,3],ymm3[2,3]
+; AVX512VL-NEXT:    vinserti128 $1, %xmm3, %ymm1, %ymm1
+; AVX512VL-NEXT:    vpaddq %ymm4, %ymm1, %ymm1
+; AVX512VL-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm0[2,3],ymm2[2,3]
 ; AVX512VL-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
-; AVX512VL-NEXT:    vpaddq %ymm4, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpaddq %ymm3, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; AVX512VL-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512VPOPCNT-LABEL: reduce_ctpop_v4i64_buildvector_v4i64:
@@ -717,21 +705,15 @@ define <4 x i64> @reduce_ctpop_v4i64_buildvector_v4i64(<4 x i64> %a0, <4 x i64>
 ; AVX512VPOPCNT-NEXT:    vpopcntq %ymm1, %ymm1
 ; AVX512VPOPCNT-NEXT:    vpopcntq %ymm2, %ymm2
 ; AVX512VPOPCNT-NEXT:    vpopcntq %ymm3, %ymm3
-; AVX512VPOPCNT-NEXT:    vextracti128 $1, %ymm2, %xmm4
-; AVX512VPOPCNT-NEXT:    vpaddq %xmm4, %xmm2, %xmm2
-; AVX512VPOPCNT-NEXT:    vextracti128 $1, %ymm3, %xmm4
-; AVX512VPOPCNT-NEXT:    vpaddq %xmm4, %xmm3, %xmm3
-; AVX512VPOPCNT-NEXT:    vpunpckhqdq {{.*#+}} xmm4 = xmm2[1],xmm3[1]
-; AVX512VPOPCNT-NEXT:    vextracti128 $1, %ymm1, %xmm5
-; AVX512VPOPCNT-NEXT:    vpaddq %xmm5, %xmm1, %xmm1
-; AVX512VPOPCNT-NEXT:    vextracti128 $1, %ymm0, %xmm5
-; AVX512VPOPCNT-NEXT:    vpaddq %xmm5, %xmm0, %xmm0
-; AVX512VPOPCNT-NEXT:    vpunpckhqdq {{.*#+}} xmm5 = xmm0[1],xmm1[1]
-; AVX512VPOPCNT-NEXT:    vinserti128 $1, %xmm4, %ymm5, %ymm4
-; AVX512VPOPCNT-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; AVX512VPOPCNT-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512VPOPCNT-NEXT:    vperm2i128 {{.*#+}} ymm4 = ymm1[2,3],ymm3[2,3]
+; AVX512VPOPCNT-NEXT:    vinserti128 $1, %xmm3, %ymm1, %ymm1
+; AVX512VPOPCNT-NEXT:    vpaddq %ymm4, %ymm1, %ymm1
+; AVX512VPOPCNT-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm0[2,3],ymm2[2,3]
 ; AVX512VPOPCNT-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
-; AVX512VPOPCNT-NEXT:    vpaddq %ymm4, %ymm0, %ymm0
+; AVX512VPOPCNT-NEXT:    vpaddq %ymm3, %ymm0, %ymm0
+; AVX512VPOPCNT-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; AVX512VPOPCNT-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; AVX512VPOPCNT-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
 ; AVX512VPOPCNT-NEXT:    retq
   %p0 = tail call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %a0)
   %p1 = tail call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %a1)


        

